1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "util/strbuf.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 
59 #include <errno.h>
60 #include <inttypes.h>
61 #include <locale.h>
62 #include <poll.h>
63 #include <pthread.h>
64 #include <unistd.h>
65 #ifndef HAVE_GETTID
66 #include <syscall.h>
67 #endif
68 #include <sched.h>
69 #include <signal.h>
70 #ifdef HAVE_EVENTFD_SUPPORT
71 #include <sys/eventfd.h>
72 #endif
73 #include <sys/mman.h>
74 #include <sys/wait.h>
75 #include <sys/types.h>
76 #include <sys/stat.h>
77 #include <fcntl.h>
78 #include <linux/err.h>
79 #include <linux/string.h>
80 #include <linux/time64.h>
81 #include <linux/zalloc.h>
82 #include <linux/bitmap.h>
83 #include <sys/time.h>
84 
85 struct switch_output {
86 	bool		 enabled;
87 	bool		 signal;
88 	unsigned long	 size;
89 	unsigned long	 time;
90 	const char	*str;
91 	bool		 set;
92 	char		 **filenames;
93 	int		 num_files;
94 	int		 cur_file;
95 };
96 
97 struct thread_mask {
98 	struct mmap_cpu_mask	maps;
99 	struct mmap_cpu_mask	affinity;
100 };
101 
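/*
 * Per-thread recording state. In parallel streaming (--threads) mode each
 * worker thread gets its own record_thread with its subset of the mmaps,
 * its own pollfd set, control pipes to the main thread, and byte/sample
 * counters; in the default mode a single instance serves the main thread.
 */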
102 struct record_thread {
103 	pid_t			tid;
104 	struct thread_mask	*mask;
105 	struct {
106 		int		msg[2];
107 		int		ack[2];
108 	} pipes;
109 	struct fdarray		pollfd;
110 	int			ctlfd_pos;
111 	int			nr_mmaps;
112 	struct mmap		**maps;
113 	struct mmap		**overwrite_maps;
114 	struct record		*rec;
115 	unsigned long long	samples;
116 	unsigned long		waking;
117 	u64			bytes_written;
118 	u64			bytes_transferred;
119 	u64			bytes_compressed;
120 };
121 
122 static __thread struct record_thread *thread;
123 
124 enum thread_msg {
125 	THREAD_MSG__UNDEFINED = 0,
126 	THREAD_MSG__READY,
127 	THREAD_MSG__MAX,
128 };
129 
130 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
131 	"UNDEFINED", "READY"
132 };
133 
134 enum thread_spec {
135 	THREAD_SPEC__UNDEFINED = 0,
136 	THREAD_SPEC__CPU,
137 	THREAD_SPEC__CORE,
138 	THREAD_SPEC__PACKAGE,
139 	THREAD_SPEC__NUMA,
140 	THREAD_SPEC__USER,
141 	THREAD_SPEC__MAX,
142 };
143 
144 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
145 	"undefined", "cpu", "core", "package", "numa", "user"
146 };
147 
148 struct pollfd_index_map {
149 	int evlist_pollfd_index;
150 	int thread_pollfd_index;
151 };
152 
153 struct record {
154 	struct perf_tool	tool;
155 	struct record_opts	opts;
156 	u64			bytes_written;
157 	u64			thread_bytes_written;
158 	struct perf_data	data;
159 	struct auxtrace_record	*itr;
160 	struct evlist	*evlist;
161 	struct perf_session	*session;
162 	struct evlist		*sb_evlist;
163 	pthread_t		thread_id;
164 	int			realtime_prio;
165 	bool			latency;
166 	bool			switch_output_event_set;
167 	bool			no_buildid;
168 	bool			no_buildid_set;
169 	bool			no_buildid_cache;
170 	bool			no_buildid_cache_set;
171 	bool			buildid_all;
172 	bool			buildid_mmap;
173 	bool			timestamp_filename;
174 	bool			timestamp_boundary;
175 	bool			off_cpu;
176 	const char		*filter_action;
177 	struct switch_output	switch_output;
178 	unsigned long long	samples;
179 	unsigned long		output_max_size;	/* = 0: unlimited */
180 	struct perf_debuginfod	debuginfod;
181 	int			nr_threads;
182 	struct thread_mask	*thread_masks;
183 	struct record_thread	*thread_data;
184 	struct pollfd_index_map	*index_map;
185 	size_t			index_map_sz;
186 	size_t			index_map_cnt;
187 };
188 
189 static volatile int done;
190 
191 static volatile int auxtrace_record__snapshot_started;
192 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
193 static DEFINE_TRIGGER(switch_output_trigger);
194 
195 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
196 	"SYS", "NODE", "CPU"
197 };
198 
199 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
200 				  struct perf_sample *sample, struct machine *machine);
201 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
202 				   struct perf_sample *sample, struct machine *machine);
203 static int process_timestamp_boundary(const struct perf_tool *tool,
204 				      union perf_event *event,
205 				      struct perf_sample *sample,
206 				      struct machine *machine);
207 
208 #ifndef HAVE_GETTID
209 static inline pid_t gettid(void)
210 {
211 	return (pid_t)syscall(__NR_gettid);
212 }
213 #endif
214 
215 static int record__threads_enabled(struct record *rec)
216 {
217 	return rec->opts.threads_spec;
218 }
219 
220 static bool switch_output_signal(struct record *rec)
221 {
222 	return rec->switch_output.signal &&
223 	       trigger_is_ready(&switch_output_trigger);
224 }
225 
226 static bool switch_output_size(struct record *rec)
227 {
228 	return rec->switch_output.size &&
229 	       trigger_is_ready(&switch_output_trigger) &&
230 	       (rec->bytes_written >= rec->switch_output.size);
231 }
232 
233 static bool switch_output_time(struct record *rec)
234 {
235 	return rec->switch_output.time &&
236 	       trigger_is_ready(&switch_output_trigger);
237 }
238 
239 static u64 record__bytes_written(struct record *rec)
240 {
241 	return rec->bytes_written + rec->thread_bytes_written;
242 }
243 
244 static bool record__output_max_size_exceeded(struct record *rec)
245 {
246 	return rec->output_max_size &&
247 	       (record__bytes_written(rec) >= rec->output_max_size);
248 }
249 
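/*
 * Write a block of bytes to the output and account for it. In parallel
 * streaming mode, maps that carry their own file are written to that
 * per-thread file and accounted per thread; everything else goes to the
 * main perf.data file. The byte counters drive the --max-size limit and
 * the size-based --switch-output handling below.
 */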
250 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
251 			 void *bf, size_t size)
252 {
253 	struct perf_data_file *file = &rec->session->data->file;
254 
255 	if (map && map->file)
256 		file = map->file;
257 
258 	if (perf_data_file__write(file, bf, size) < 0) {
259 		pr_err("failed to write perf data, error: %m\n");
260 		return -1;
261 	}
262 
263 	if (map && map->file) {
264 		thread->bytes_written += size;
265 		rec->thread_bytes_written += size;
266 	} else {
267 		rec->bytes_written += size;
268 	}
269 
270 	if (record__output_max_size_exceeded(rec) && !done) {
271 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
272 				" stopping session ]\n",
273 				record__bytes_written(rec) >> 10);
274 		done = 1;
275 	}
276 
277 	if (switch_output_size(rec))
278 		trigger_hit(&switch_output_trigger);
279 
280 	return 0;
281 }
282 
283 static int record__aio_enabled(struct record *rec);
284 static int record__comp_enabled(struct record *rec);
285 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
286 			    void *dst, size_t dst_size, void *src, size_t src_size);
287 
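/*
 * Asynchronous trace writing (--aio): ring buffer contents are copied (or
 * compressed) into one of the map's aio.data[] buffers and queued with
 * aio_write(), so the kernel ring buffer can be released without waiting
 * for the write to the output file to finish.
 */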
288 #ifdef HAVE_AIO_SUPPORT
289 static int record__aio_write(struct aiocb *cblock, int trace_fd,
290 		void *buf, size_t size, off_t off)
291 {
292 	int rc;
293 
294 	cblock->aio_fildes = trace_fd;
295 	cblock->aio_buf    = buf;
296 	cblock->aio_nbytes = size;
297 	cblock->aio_offset = off;
298 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
299 
300 	do {
301 		rc = aio_write(cblock);
302 		if (rc == 0) {
303 			break;
304 		} else if (errno != EAGAIN) {
305 			cblock->aio_fildes = -1;
306 			pr_err("failed to queue perf data, error: %m\n");
307 			break;
308 		}
309 	} while (1);
310 
311 	return rc;
312 }
313 
314 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
315 {
316 	void *rem_buf;
317 	off_t rem_off;
318 	size_t rem_size;
319 	int rc, aio_errno;
320 	ssize_t aio_ret, written;
321 
322 	aio_errno = aio_error(cblock);
323 	if (aio_errno == EINPROGRESS)
324 		return 0;
325 
326 	written = aio_ret = aio_return(cblock);
327 	if (aio_ret < 0) {
328 		if (aio_errno != EINTR)
329 			pr_err("failed to write perf data, error: %m\n");
330 		written = 0;
331 	}
332 
333 	rem_size = cblock->aio_nbytes - written;
334 
335 	if (rem_size == 0) {
336 		cblock->aio_fildes = -1;
337 		/*
338 		 * md->refcount is incremented in record__aio_pushfn() for
339 		 * every aio write request started in record__aio_push() so
340 		 * decrement it because the request is now complete.
341 		 */
342 		perf_mmap__put(&md->core);
343 		rc = 1;
344 	} else {
345 		/*
346 		 * The aio write request may need to be restarted with the
347 		 * remainder if the kernel didn't write the whole
348 		 * chunk at once.
349 		 */
350 		rem_off = cblock->aio_offset + written;
351 		rem_buf = (void *)(cblock->aio_buf + written);
352 		record__aio_write(cblock, cblock->aio_fildes,
353 				rem_buf, rem_size, rem_off);
354 		rc = 0;
355 	}
356 
357 	return rc;
358 }
359 
360 static int record__aio_sync(struct mmap *md, bool sync_all)
361 {
362 	struct aiocb **aiocb = md->aio.aiocb;
363 	struct aiocb *cblocks = md->aio.cblocks;
364 	struct timespec timeout = { 0, 1000 * 1000 * 1 }; /* 1ms */
365 	int i, do_suspend;
366 
367 	do {
368 		do_suspend = 0;
369 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
370 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
371 				if (sync_all)
372 					aiocb[i] = NULL;
373 				else
374 					return i;
375 			} else {
376 				/*
377 				 * The started aio write is not complete yet,
378 				 * so it has to be waited on before the
379 				 * next allocation.
380 				 */
381 				aiocb[i] = &cblocks[i];
382 				do_suspend = 1;
383 			}
384 		}
385 		if (!do_suspend)
386 			return -1;
387 
388 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
389 			if (!(errno == EAGAIN || errno == EINTR))
390 				pr_err("failed to sync perf data, error: %m\n");
391 		}
392 	} while (1);
393 }
394 
395 struct record_aio {
396 	struct record	*rec;
397 	void		*data;
398 	size_t		size;
399 };
400 
401 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
402 {
403 	struct record_aio *aio = to;
404 
405 	/*
406 	 * The map->core.base data pointed to by buf is copied into a free
407 	 * map->aio.data[] buffer to release space in the kernel buffer as fast
408 	 * as possible, calling perf_mmap__consume() from the perf_mmap__push() function.
409 	 *
410 	 * That lets the kernel proceed with storing more profiling data into
411 	 * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
412 	 *
413 	 * Copying can be done in two steps in case the chunk of profiling data
414 	 * crosses the upper bound of the kernel buffer. In this case we first move
415 	 * the part of the data from map->start up to the upper bound and then the
416 	 * remainder from the beginning of the kernel buffer to the end of the data chunk.
417 	 */
418 
419 	if (record__comp_enabled(aio->rec)) {
420 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
421 						   mmap__mmap_len(map) - aio->size,
422 						   buf, size);
423 		if (compressed < 0)
424 			return (int)compressed;
425 
426 		size = compressed;
427 	} else {
428 		memcpy(aio->data + aio->size, buf, size);
429 	}
430 
431 	if (!aio->size) {
432 		/*
433 		 * Increment map->refcount to guard the map->aio.data[] buffer
434 		 * from premature deallocation, because the map object can be
435 		 * released before the aio write request started on the
436 		 * map->aio.data[] buffer completes.
437 		 *
438 		 * perf_mmap__put() is done in record__aio_complete() after the
439 		 * started aio request completes, or in record__aio_push()
440 		 * if the request failed to start.
441 		 */
442 		perf_mmap__get(&map->core);
443 	}
444 
445 	aio->size += size;
446 
447 	return size;
448 }
449 
450 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
451 {
452 	int ret, idx;
453 	int trace_fd = rec->session->data->file.fd;
454 	struct record_aio aio = { .rec = rec, .size = 0 };
455 
456 	/*
457 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
458 	 * becomes available after the previous aio write operation.
459 	 */
460 
461 	idx = record__aio_sync(map, false);
462 	aio.data = map->aio.data[idx];
463 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
464 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
465 		return ret;
466 
467 	rec->samples++;
468 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
469 	if (!ret) {
470 		*off += aio.size;
471 		rec->bytes_written += aio.size;
472 		if (switch_output_size(rec))
473 			trigger_hit(&switch_output_trigger);
474 	} else {
475 		/*
476 		 * Decrement the map->refcount incremented in record__aio_pushfn()
477 		 * if the record__aio_write() operation failed to start; otherwise
478 		 * map->refcount is decremented in record__aio_complete() after
479 		 * the aio write operation finishes successfully.
480 		 */
481 		perf_mmap__put(&map->core);
482 	}
483 
484 	return ret;
485 }
486 
487 static off_t record__aio_get_pos(int trace_fd)
488 {
489 	return lseek(trace_fd, 0, SEEK_CUR);
490 }
491 
492 static void record__aio_set_pos(int trace_fd, off_t pos)
493 {
494 	lseek(trace_fd, pos, SEEK_SET);
495 }
496 
497 static void record__aio_mmap_read_sync(struct record *rec)
498 {
499 	int i;
500 	struct evlist *evlist = rec->evlist;
501 	struct mmap *maps = evlist->mmap;
502 
503 	if (!record__aio_enabled(rec))
504 		return;
505 
506 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
507 		struct mmap *map = &maps[i];
508 
509 		if (map->core.base)
510 			record__aio_sync(map, true);
511 	}
512 }
513 
514 static int nr_cblocks_default = 1;
515 static int nr_cblocks_max = 4;
516 
517 static int record__aio_parse(const struct option *opt,
518 			     const char *str,
519 			     int unset)
520 {
521 	struct record_opts *opts = (struct record_opts *)opt->value;
522 
523 	if (unset) {
524 		opts->nr_cblocks = 0;
525 	} else {
526 		if (str)
527 			opts->nr_cblocks = strtol(str, NULL, 0);
528 		if (!opts->nr_cblocks)
529 			opts->nr_cblocks = nr_cblocks_default;
530 	}
531 
532 	return 0;
533 }
534 #else /* HAVE_AIO_SUPPORT */
535 static int nr_cblocks_max = 0;
536 
537 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
538 			    off_t *off __maybe_unused)
539 {
540 	return -1;
541 }
542 
543 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
544 {
545 	return -1;
546 }
547 
548 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
549 {
550 }
551 
552 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
553 {
554 }
555 #endif
556 
557 static int record__aio_enabled(struct record *rec)
558 {
559 	return rec->opts.nr_cblocks > 0;
560 }
561 
562 #define MMAP_FLUSH_DEFAULT 1
563 static int record__mmap_flush_parse(const struct option *opt,
564 				    const char *str,
565 				    int unset)
566 {
567 	int flush_max;
568 	struct record_opts *opts = (struct record_opts *)opt->value;
569 	static struct parse_tag tags[] = {
570 			{ .tag  = 'B', .mult = 1       },
571 			{ .tag  = 'K', .mult = 1 << 10 },
572 			{ .tag  = 'M', .mult = 1 << 20 },
573 			{ .tag  = 'G', .mult = 1 << 30 },
574 			{ .tag  = 0 },
575 	};
576 
577 	if (unset)
578 		return 0;
579 
580 	if (str) {
581 		opts->mmap_flush = parse_tag_value(str, tags);
582 		if (opts->mmap_flush == (int)-1)
583 			opts->mmap_flush = strtol(str, NULL, 0);
584 	}
585 
586 	if (!opts->mmap_flush)
587 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
588 
589 	flush_max = evlist__mmap_size(opts->mmap_pages);
590 	flush_max /= 4;
591 	if (opts->mmap_flush > flush_max)
592 		opts->mmap_flush = flush_max;
593 
594 	return 0;
595 }
596 
597 #ifdef HAVE_ZSTD_SUPPORT
598 static unsigned int comp_level_default = 1;
599 
600 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
601 {
602 	struct record_opts *opts = opt->value;
603 
604 	if (unset) {
605 		opts->comp_level = 0;
606 	} else {
607 		if (str)
608 			opts->comp_level = strtol(str, NULL, 0);
609 		if (!opts->comp_level)
610 			opts->comp_level = comp_level_default;
611 	}
612 
613 	return 0;
614 }
615 #endif
616 static unsigned int comp_level_max = 22;
617 
618 static int record__comp_enabled(struct record *rec)
619 {
620 	return rec->opts.comp_level > 0;
621 }
622 
623 static int process_synthesized_event(const struct perf_tool *tool,
624 				     union perf_event *event,
625 				     struct perf_sample *sample __maybe_unused,
626 				     struct machine *machine __maybe_unused)
627 {
628 	struct record *rec = container_of(tool, struct record, tool);
629 	return record__write(rec, NULL, event, event->header.size);
630 }
631 
632 static struct mutex synth_lock;
633 
634 static int process_locked_synthesized_event(const struct perf_tool *tool,
635 				     union perf_event *event,
636 				     struct perf_sample *sample __maybe_unused,
637 				     struct machine *machine __maybe_unused)
638 {
639 	int ret;
640 
641 	mutex_lock(&synth_lock);
642 	ret = process_synthesized_event(tool, event, sample, machine);
643 	mutex_unlock(&synth_lock);
644 	return ret;
645 }
646 
647 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
648 {
649 	struct record *rec = to;
650 
651 	if (record__comp_enabled(rec)) {
652 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
653 						   mmap__mmap_len(map), bf, size);
654 
655 		if (compressed < 0)
656 			return (int)compressed;
657 
658 		size = compressed;
659 		bf   = map->data;
660 	}
661 
662 	thread->samples++;
663 	return record__write(rec, map, bf, size);
664 }
665 
666 static volatile sig_atomic_t signr = -1;
667 static volatile sig_atomic_t child_finished;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 static volatile sig_atomic_t done_fd = -1;
670 #endif
671 
672 static void sig_handler(int sig)
673 {
674 	if (sig == SIGCHLD)
675 		child_finished = 1;
676 	else
677 		signr = sig;
678 
679 	done = 1;
680 #ifdef HAVE_EVENTFD_SUPPORT
681 	if (done_fd >= 0) {
682 		u64 tmp = 1;
683 		int orig_errno = errno;
684 
685 		/*
686 		 * It is possible for this signal handler to run after done is
687 		 * checked in the main loop, but before the perf counter fds are
688 		 * polled. If this happens, the poll() will continue to wait
689 		 * even though done is set, and will only break out if either
690 		 * another signal is received, or the counters are ready for
691 		 * read. To ensure the poll() doesn't sleep when done is set,
692 		 * use an eventfd (done_fd) to wake up the poll().
693 		 */
694 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
695 			pr_err("failed to signal wakeup fd, error: %m\n");
696 
697 		errno = orig_errno;
698 	}
699 #endif // HAVE_EVENTFD_SUPPORT
700 }
701 
702 static void sigsegv_handler(int sig)
703 {
704 	perf_hooks__recover();
705 	sighandler_dump_stack(sig);
706 }
707 
708 static void record__sig_exit(void)
709 {
710 	if (signr == -1)
711 		return;
712 
713 	signal(signr, SIG_DFL);
714 	raise(signr);
715 }
716 
717 #ifdef HAVE_AUXTRACE_SUPPORT
718 
719 static int record__process_auxtrace(const struct perf_tool *tool,
720 				    struct mmap *map,
721 				    union perf_event *event, void *data1,
722 				    size_t len1, void *data2, size_t len2)
723 {
724 	struct record *rec = container_of(tool, struct record, tool);
725 	struct perf_data *data = &rec->data;
726 	size_t padding;
727 	u8 pad[8] = {0};
728 
729 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
730 		off_t file_offset;
731 		int fd = perf_data__fd(data);
732 		int err;
733 
734 		file_offset = lseek(fd, 0, SEEK_CUR);
735 		if (file_offset == -1)
736 			return -1;
737 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
738 						     event, file_offset);
739 		if (err)
740 			return err;
741 	}
742 
743 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
744 	padding = (len1 + len2) & 7;
745 	if (padding)
746 		padding = 8 - padding;
747 
748 	record__write(rec, map, event, event->header.size);
749 	record__write(rec, map, data1, len1);
750 	if (len2)
751 		record__write(rec, map, data2, len2);
752 	record__write(rec, map, &pad, padding);
753 
754 	return 0;
755 }
756 
757 static int record__auxtrace_mmap_read(struct record *rec,
758 				      struct mmap *map)
759 {
760 	int ret;
761 
762 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
763 				  record__process_auxtrace);
764 	if (ret < 0)
765 		return ret;
766 
767 	if (ret)
768 		rec->samples++;
769 
770 	return 0;
771 }
772 
773 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
774 					       struct mmap *map)
775 {
776 	int ret;
777 
778 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
779 					   record__process_auxtrace,
780 					   rec->opts.auxtrace_snapshot_size);
781 	if (ret < 0)
782 		return ret;
783 
784 	if (ret)
785 		rec->samples++;
786 
787 	return 0;
788 }
789 
790 static int record__auxtrace_read_snapshot_all(struct record *rec)
791 {
792 	int i;
793 	int rc = 0;
794 
795 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
796 		struct mmap *map = &rec->evlist->mmap[i];
797 
798 		if (!map->auxtrace_mmap.base)
799 			continue;
800 
801 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
802 			rc = -1;
803 			goto out;
804 		}
805 	}
806 out:
807 	return rc;
808 }
809 
810 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
811 {
812 	pr_debug("Recording AUX area tracing snapshot\n");
813 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
814 		trigger_error(&auxtrace_snapshot_trigger);
815 	} else {
816 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
817 			trigger_error(&auxtrace_snapshot_trigger);
818 		else
819 			trigger_ready(&auxtrace_snapshot_trigger);
820 	}
821 }
822 
823 static int record__auxtrace_snapshot_exit(struct record *rec)
824 {
825 	if (trigger_is_error(&auxtrace_snapshot_trigger))
826 		return 0;
827 
828 	if (!auxtrace_record__snapshot_started &&
829 	    auxtrace_record__snapshot_start(rec->itr))
830 		return -1;
831 
832 	record__read_auxtrace_snapshot(rec, true);
833 	if (trigger_is_error(&auxtrace_snapshot_trigger))
834 		return -1;
835 
836 	return 0;
837 }
838 
839 static int record__auxtrace_init(struct record *rec)
840 {
841 	int err;
842 
843 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
844 	    && record__threads_enabled(rec)) {
845 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
846 		return -EINVAL;
847 	}
848 
849 	if (!rec->itr) {
850 		rec->itr = auxtrace_record__init(rec->evlist, &err);
851 		if (err)
852 			return err;
853 	}
854 
855 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
856 					      rec->opts.auxtrace_snapshot_opts);
857 	if (err)
858 		return err;
859 
860 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
861 					    rec->opts.auxtrace_sample_opts);
862 	if (err)
863 		return err;
864 
865 	err = auxtrace_parse_aux_action(rec->evlist);
866 	if (err)
867 		return err;
868 
869 	return auxtrace_parse_filters(rec->evlist);
870 }
871 
872 #else
873 
874 static inline
875 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
876 			       struct mmap *map __maybe_unused)
877 {
878 	return 0;
879 }
880 
881 static inline
882 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
883 				    bool on_exit __maybe_unused)
884 {
885 }
886 
887 static inline
888 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
889 {
890 	return 0;
891 }
892 
893 static inline
894 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
895 {
896 	return 0;
897 }
898 
899 static int record__auxtrace_init(struct record *rec __maybe_unused)
900 {
901 	return 0;
902 }
903 
904 #endif
905 
906 static int record__config_text_poke(struct evlist *evlist)
907 {
908 	struct evsel *evsel;
909 
910 	/* Nothing to do if text poke is already configured */
911 	evlist__for_each_entry(evlist, evsel) {
912 		if (evsel->core.attr.text_poke)
913 			return 0;
914 	}
915 
916 	evsel = evlist__add_dummy_on_all_cpus(evlist);
917 	if (!evsel)
918 		return -ENOMEM;
919 
920 	evsel->core.attr.text_poke = 1;
921 	evsel->core.attr.ksymbol = 1;
922 	evsel->immediate = true;
923 	evsel__set_sample_bit(evsel, TIME);
924 
925 	return 0;
926 }
927 
928 static int record__config_off_cpu(struct record *rec)
929 {
930 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
931 }
932 
933 static bool record__tracking_system_wide(struct record *rec)
934 {
935 	struct evlist *evlist = rec->evlist;
936 	struct evsel *evsel;
937 
938 	/*
939 	 * If a non-dummy evsel exists, system-wide sideband is needed to
940 	 * help parse sample information.
941 	 * For example, the PERF_RECORD_MMAP event helps resolve symbols,
942 	 * and the PERF_RECORD_COMM event helps resolve the task executable name.
943 	 */
944 	evlist__for_each_entry(evlist, evsel) {
945 		if (!evsel__is_dummy_event(evsel))
946 			return true;
947 	}
948 
949 	return false;
950 }
951 
952 static int record__config_tracking_events(struct record *rec)
953 {
954 	struct record_opts *opts = &rec->opts;
955 	struct evlist *evlist = rec->evlist;
956 	bool system_wide = false;
957 	struct evsel *evsel;
958 
959 	/*
960 	 * For initial_delay, system wide or a hybrid system, we need to add a
961 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
962 	 * delay of waiting or event synthesis.
963 	 */
964 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
965 	    perf_pmus__num_core_pmus() > 1) {
966 
967 		/*
968 		 * User space tasks can migrate between CPUs, so when tracing
969 		 * selected CPUs, sideband for all CPUs is still needed.
970 		 */
971 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
972 			system_wide = true;
973 
974 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
975 		if (!evsel)
976 			return -ENOMEM;
977 
978 		/*
979 		 * Enable the tracking event when the process is forked for
980 		 * initial_delay, immediately for system wide.
981 		 */
982 		if (opts->target.initial_delay && !evsel->immediate &&
983 		    !target__has_cpu(&opts->target))
984 			evsel->core.attr.enable_on_exec = 1;
985 		else
986 			evsel->immediate = 1;
987 	}
988 
989 	return 0;
990 }
991 
992 static bool record__kcore_readable(struct machine *machine)
993 {
994 	char kcore[PATH_MAX];
995 	int fd;
996 
997 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
998 
999 	fd = open(kcore, O_RDONLY);
1000 	if (fd < 0)
1001 		return false;
1002 
1003 	close(fd);
1004 
1005 	return true;
1006 }
1007 
1008 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1009 {
1010 	char from_dir[PATH_MAX];
1011 	char kcore_dir[PATH_MAX];
1012 	int ret;
1013 
1014 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1015 
1016 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1017 	if (ret)
1018 		return ret;
1019 
1020 	return kcore_copy(from_dir, kcore_dir);
1021 }
1022 
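/*
 * Each worker thread communicates with the main thread over two pipes:
 * "msg" carries control from the main thread (the worker polls msg[0] and
 * treats POLLHUP as a request to terminate), while "ack" is used by the
 * worker to report that it has started or finished.
 */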
1023 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1024 {
1025 	thread_data->pipes.msg[0] = -1;
1026 	thread_data->pipes.msg[1] = -1;
1027 	thread_data->pipes.ack[0] = -1;
1028 	thread_data->pipes.ack[1] = -1;
1029 }
1030 
1031 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1032 {
1033 	if (pipe(thread_data->pipes.msg))
1034 		return -EINVAL;
1035 
1036 	if (pipe(thread_data->pipes.ack)) {
1037 		close(thread_data->pipes.msg[0]);
1038 		thread_data->pipes.msg[0] = -1;
1039 		close(thread_data->pipes.msg[1]);
1040 		thread_data->pipes.msg[1] = -1;
1041 		return -EINVAL;
1042 	}
1043 
1044 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1045 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1046 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1047 
1048 	return 0;
1049 }
1050 
1051 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1052 {
1053 	if (thread_data->pipes.msg[0] != -1) {
1054 		close(thread_data->pipes.msg[0]);
1055 		thread_data->pipes.msg[0] = -1;
1056 	}
1057 	if (thread_data->pipes.msg[1] != -1) {
1058 		close(thread_data->pipes.msg[1]);
1059 		thread_data->pipes.msg[1] = -1;
1060 	}
1061 	if (thread_data->pipes.ack[0] != -1) {
1062 		close(thread_data->pipes.ack[0]);
1063 		thread_data->pipes.ack[0] = -1;
1064 	}
1065 	if (thread_data->pipes.ack[1] != -1) {
1066 		close(thread_data->pipes.ack[1]);
1067 		thread_data->pipes.ack[1] = -1;
1068 	}
1069 }
1070 
1071 static bool evlist__per_thread(struct evlist *evlist)
1072 {
1073 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1074 }
1075 
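/*
 * Distribute the evlist mmaps across this worker thread: in per-thread mode
 * the thread takes all of them, otherwise it takes only the mmaps whose CPU
 * is set in the thread's 'maps' CPU mask.
 */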
1076 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1077 {
1078 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1079 	struct mmap *mmap = evlist->mmap;
1080 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1081 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1082 	bool per_thread = evlist__per_thread(evlist);
1083 
1084 	if (per_thread)
1085 		thread_data->nr_mmaps = nr_mmaps;
1086 	else
1087 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1088 						      thread_data->mask->maps.nbits);
1089 	if (mmap) {
1090 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1091 		if (!thread_data->maps)
1092 			return -ENOMEM;
1093 	}
1094 	if (overwrite_mmap) {
1095 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1096 		if (!thread_data->overwrite_maps) {
1097 			zfree(&thread_data->maps);
1098 			return -ENOMEM;
1099 		}
1100 	}
1101 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1102 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1103 
1104 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1105 		if (per_thread ||
1106 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1107 			if (thread_data->maps) {
1108 				thread_data->maps[tm] = &mmap[m];
1109 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1110 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1111 			}
1112 			if (thread_data->overwrite_maps) {
1113 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1114 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1115 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1116 			}
1117 			tm++;
1118 		}
1119 	}
1120 
1121 	return 0;
1122 }
1123 
1124 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1125 {
1126 	int f, tm, pos;
1127 	struct mmap *map, *overwrite_map;
1128 
1129 	fdarray__init(&thread_data->pollfd, 64);
1130 
1131 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1132 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1133 		overwrite_map = thread_data->overwrite_maps ?
1134 				thread_data->overwrite_maps[tm] : NULL;
1135 
1136 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1137 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1138 
1139 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1140 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1141 							      &evlist->core.pollfd);
1142 				if (pos < 0)
1143 					return pos;
1144 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1145 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1146 			}
1147 		}
1148 	}
1149 
1150 	return 0;
1151 }
1152 
1153 static void record__free_thread_data(struct record *rec)
1154 {
1155 	int t;
1156 	struct record_thread *thread_data = rec->thread_data;
1157 
1158 	if (thread_data == NULL)
1159 		return;
1160 
1161 	for (t = 0; t < rec->nr_threads; t++) {
1162 		record__thread_data_close_pipes(&thread_data[t]);
1163 		zfree(&thread_data[t].maps);
1164 		zfree(&thread_data[t].overwrite_maps);
1165 		fdarray__exit(&thread_data[t].pollfd);
1166 	}
1167 
1168 	zfree(&rec->thread_data);
1169 }
1170 
1171 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1172 						    int evlist_pollfd_index,
1173 						    int thread_pollfd_index)
1174 {
1175 	size_t x = rec->index_map_cnt;
1176 
1177 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1178 		return -ENOMEM;
1179 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1180 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1181 	rec->index_map_cnt += 1;
1182 	return 0;
1183 }
1184 
1185 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1186 						    struct evlist *evlist,
1187 						    struct record_thread *thread_data)
1188 {
1189 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1190 	struct pollfd *t_entries = thread_data->pollfd.entries;
1191 	int err = 0;
1192 	size_t i;
1193 
1194 	for (i = 0; i < rec->index_map_cnt; i++) {
1195 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1196 		int t_pos = rec->index_map[i].thread_pollfd_index;
1197 
1198 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1199 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1200 			pr_err("Thread and evlist pollfd index mismatch\n");
1201 			err = -EINVAL;
1202 			continue;
1203 		}
1204 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1205 	}
1206 	return err;
1207 }
1208 
1209 static int record__dup_non_perf_events(struct record *rec,
1210 				       struct evlist *evlist,
1211 				       struct record_thread *thread_data)
1212 {
1213 	struct fdarray *fda = &evlist->core.pollfd;
1214 	int i, ret;
1215 
1216 	for (i = 0; i < fda->nr; i++) {
1217 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1218 			continue;
1219 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1220 		if (ret < 0) {
1221 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1222 			return ret;
1223 		}
1224 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1225 			  thread_data, ret, fda->entries[i].fd);
1226 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1227 		if (ret < 0) {
1228 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1229 			return ret;
1230 		}
1231 	}
1232 	return 0;
1233 }
1234 
1235 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1236 {
1237 	int t, ret;
1238 	struct record_thread *thread_data;
1239 
1240 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1241 	if (!rec->thread_data) {
1242 		pr_err("Failed to allocate thread data\n");
1243 		return -ENOMEM;
1244 	}
1245 	thread_data = rec->thread_data;
1246 
1247 	for (t = 0; t < rec->nr_threads; t++)
1248 		record__thread_data_init_pipes(&thread_data[t]);
1249 
1250 	for (t = 0; t < rec->nr_threads; t++) {
1251 		thread_data[t].rec = rec;
1252 		thread_data[t].mask = &rec->thread_masks[t];
1253 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1254 		if (ret) {
1255 			pr_err("Failed to initialize thread[%d] maps\n", t);
1256 			goto out_free;
1257 		}
1258 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1259 		if (ret) {
1260 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1261 			goto out_free;
1262 		}
1263 		if (t) {
1264 			thread_data[t].tid = -1;
1265 			ret = record__thread_data_open_pipes(&thread_data[t]);
1266 			if (ret) {
1267 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1268 				goto out_free;
1269 			}
1270 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1271 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1272 			if (ret < 0) {
1273 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1274 				goto out_free;
1275 			}
1276 			thread_data[t].ctlfd_pos = ret;
1277 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1278 				 thread_data, thread_data[t].ctlfd_pos,
1279 				 thread_data[t].pipes.msg[0]);
1280 		} else {
1281 			thread_data[t].tid = gettid();
1282 
1283 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1284 			if (ret < 0)
1285 				goto out_free;
1286 
1287 			thread_data[t].ctlfd_pos = -1; /* Not used */
1288 		}
1289 	}
1290 
1291 	return 0;
1292 
1293 out_free:
1294 	record__free_thread_data(rec);
1295 
1296 	return ret;
1297 }
1298 
1299 static int record__mmap_evlist(struct record *rec,
1300 			       struct evlist *evlist)
1301 {
1302 	int i, ret;
1303 	struct record_opts *opts = &rec->opts;
1304 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1305 				  opts->auxtrace_sample_mode;
1306 	char msg[512];
1307 
1308 	if (opts->affinity != PERF_AFFINITY_SYS)
1309 		cpu__setup_cpunode_map();
1310 
1311 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1312 				 opts->auxtrace_mmap_pages,
1313 				 auxtrace_overwrite,
1314 				 opts->nr_cblocks, opts->affinity,
1315 				 opts->mmap_flush, opts->comp_level) < 0) {
1316 		if (errno == EPERM) {
1317 			pr_err("Permission error mapping pages.\n"
1318 			       "Consider increasing "
1319 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1320 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1321 			       "(current value: %u,%u)\n",
1322 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1323 			return -errno;
1324 		} else {
1325 			pr_err("failed to mmap with %d (%s)\n", errno,
1326 				str_error_r(errno, msg, sizeof(msg)));
1327 			if (errno)
1328 				return -errno;
1329 			else
1330 				return -EINVAL;
1331 		}
1332 	}
1333 
1334 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1335 		return -1;
1336 
1337 	ret = record__alloc_thread_data(rec, evlist);
1338 	if (ret)
1339 		return ret;
1340 
1341 	if (record__threads_enabled(rec)) {
1342 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1343 		if (ret) {
1344 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1345 			return ret;
1346 		}
1347 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1348 			if (evlist->mmap)
1349 				evlist->mmap[i].file = &rec->data.dir.files[i];
1350 			if (evlist->overwrite_mmap)
1351 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1352 		}
1353 	}
1354 
1355 	return 0;
1356 }
1357 
1358 static int record__mmap(struct record *rec)
1359 {
1360 	return record__mmap_evlist(rec, rec->evlist);
1361 }
1362 
1363 static int record__open(struct record *rec)
1364 {
1365 	char msg[BUFSIZ];
1366 	struct evsel *pos;
1367 	struct evlist *evlist = rec->evlist;
1368 	struct perf_session *session = rec->session;
1369 	struct record_opts *opts = &rec->opts;
1370 	int rc = 0;
1371 
1372 	evlist__for_each_entry(evlist, pos) {
1373 try_again:
1374 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1375 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1376 				if (verbose > 0)
1377 					ui__warning("%s\n", msg);
1378 				goto try_again;
1379 			}
1380 			if ((errno == EINVAL || errno == EBADF) &&
1381 			    pos->core.leader != &pos->core &&
1382 			    pos->weak_group) {
1383 				pos = evlist__reset_weak_group(evlist, pos, true);
1384 				goto try_again;
1385 			}
1386 			rc = -errno;
1387 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1388 			ui__error("%s\n", msg);
1389 			goto out;
1390 		}
1391 
1392 		pos->supported = true;
1393 	}
1394 
1395 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1396 		pr_warning(
1397 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1398 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1399 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1400 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1401 "Samples in kernel modules won't be resolved at all.\n\n"
1402 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1403 "even with a suitable vmlinux or kallsyms file.\n\n");
1404 	}
1405 
1406 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1407 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1408 			pos->filter ?: "BPF", evsel__name(pos), errno,
1409 			str_error_r(errno, msg, sizeof(msg)));
1410 		rc = -1;
1411 		goto out;
1412 	}
1413 
1414 	rc = record__mmap(rec);
1415 	if (rc)
1416 		goto out;
1417 
1418 	session->evlist = evlist;
1419 	perf_session__set_id_hdr_size(session);
1420 out:
1421 	return rc;
1422 }
1423 
1424 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1425 {
1426 	if (rec->evlist->first_sample_time == 0)
1427 		rec->evlist->first_sample_time = sample_time;
1428 
1429 	if (sample_time)
1430 		rec->evlist->last_sample_time = sample_time;
1431 }
1432 
1433 static int process_sample_event(const struct perf_tool *tool,
1434 				union perf_event *event,
1435 				struct perf_sample *sample,
1436 				struct evsel *evsel,
1437 				struct machine *machine)
1438 {
1439 	struct record *rec = container_of(tool, struct record, tool);
1440 
1441 	set_timestamp_boundary(rec, sample->time);
1442 
1443 	if (rec->buildid_all)
1444 		return 0;
1445 
1446 	rec->samples++;
1447 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1448 }
1449 
1450 static int process_buildids(struct record *rec)
1451 {
1452 	struct perf_session *session = rec->session;
1453 
1454 	if (perf_data__size(&rec->data) == 0)
1455 		return 0;
1456 
1457 	/*
1458 	 * During this process, it'll load the kernel map and replace the
1459 	 * dso->long_name with the real pathname it found.  In this case
1460 	 * we prefer a vmlinux path like
1461 	 *   /lib/modules/3.16.4/build/vmlinux
1462 	 *
1463 	 * rather than the build-id path (in the debug directory):
1464 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1465 	 */
1466 	symbol_conf.ignore_vmlinux_buildid = true;
1467 
1468 	/*
1469 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1470 	 * so there is no need to process samples. But if timestamp_boundary is
1471 	 * enabled, it still needs to walk all samples to get the timestamps of
1472 	 * the first/last samples.
1473 	 */
1474 	if (rec->buildid_all && !rec->timestamp_boundary)
1475 		rec->tool.sample = process_event_sample_stub;
1476 
1477 	return perf_session__process_events(session);
1478 }
1479 
1480 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1481 {
1482 	int err;
1483 	struct perf_tool *tool = data;
1484 	/*
1485 	 * As for the guest kernel, when processing the record & report
1486 	 * subcommands we arrange the module mmaps prior to the guest kernel
1487 	 * mmap and trigger a dso preload, because by default guest module
1488 	 * symbols are loaded from guest kallsyms instead of
1489 	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
1490 	 * address is in a module instead of in the guest kernel.
1491 	 */
1492 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1493 					     machine);
1494 	if (err < 0)
1495 		pr_err("Couldn't record guest kernel [%d]'s reference"
1496 		       " relocation symbol.\n", machine->pid);
1497 
1498 	/*
1499 	 * We use _stext for the guest kernel because the guest kernel's
1500 	 * /proc/kallsyms sometimes has no _text.
1501 	 */
1502 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1503 						 machine);
1504 	if (err < 0)
1505 		pr_err("Couldn't record guest kernel [%d]'s reference"
1506 		       " relocation symbol.\n", machine->pid);
1507 }
1508 
1509 static struct perf_event_header finished_round_event = {
1510 	.size = sizeof(struct perf_event_header),
1511 	.type = PERF_RECORD_FINISHED_ROUND,
1512 };
1513 
1514 static struct perf_event_header finished_init_event = {
1515 	.size = sizeof(struct perf_event_header),
1516 	.type = PERF_RECORD_FINISHED_INIT,
1517 };
1518 
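/*
 * With --affinity=node or --affinity=cpu, migrate the recording thread onto
 * the CPUs backing the mmap before reading it, so buffer accesses stay local
 * to the NUMA node that owns the data.
 */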
1519 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1520 {
1521 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1522 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1523 			  thread->mask->affinity.nbits)) {
1524 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1525 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1526 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1527 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1528 					(cpu_set_t *)thread->mask->affinity.bits);
1529 		if (verbose == 2) {
1530 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1531 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1532 		}
1533 	}
1534 }
1535 
1536 static size_t process_comp_header(void *record, size_t increment)
1537 {
1538 	struct perf_record_compressed *event = record;
1539 	size_t size = sizeof(*event);
1540 
1541 	if (increment) {
1542 		event->header.size += increment;
1543 		return increment;
1544 	}
1545 
1546 	event->header.type = PERF_RECORD_COMPRESSED;
1547 	event->header.size = size;
1548 
1549 	return size;
1550 }
1551 
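/*
 * Compress a chunk of ring-buffer data into PERF_RECORD_COMPRESSED records,
 * using the per-map Zstd stream in threaded mode or the session-wide one
 * otherwise, and account the transferred vs. compressed byte counts.
 */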
1552 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1553 			    void *dst, size_t dst_size, void *src, size_t src_size)
1554 {
1555 	ssize_t compressed;
1556 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1557 	struct zstd_data *zstd_data = &session->zstd_data;
1558 
1559 	if (map && map->file)
1560 		zstd_data = &map->zstd_data;
1561 
1562 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1563 						     max_record_size, process_comp_header);
1564 	if (compressed < 0)
1565 		return compressed;
1566 
1567 	if (map && map->file) {
1568 		thread->bytes_transferred += src_size;
1569 		thread->bytes_compressed  += compressed;
1570 	} else {
1571 		session->bytes_transferred += src_size;
1572 		session->bytes_compressed  += compressed;
1573 	}
1574 
1575 	return compressed;
1576 }
1577 
1578 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1579 				    bool overwrite, bool synch)
1580 {
1581 	u64 bytes_written = rec->bytes_written;
1582 	int i;
1583 	int rc = 0;
1584 	int nr_mmaps;
1585 	struct mmap **maps;
1586 	int trace_fd = rec->data.file.fd;
1587 	off_t off = 0;
1588 
1589 	if (!evlist)
1590 		return 0;
1591 
1592 	nr_mmaps = thread->nr_mmaps;
1593 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1594 
1595 	if (!maps)
1596 		return 0;
1597 
1598 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1599 		return 0;
1600 
1601 	if (record__aio_enabled(rec))
1602 		off = record__aio_get_pos(trace_fd);
1603 
1604 	for (i = 0; i < nr_mmaps; i++) {
1605 		u64 flush = 0;
1606 		struct mmap *map = maps[i];
1607 
1608 		if (map->core.base) {
1609 			record__adjust_affinity(rec, map);
1610 			if (synch) {
1611 				flush = map->core.flush;
1612 				map->core.flush = 1;
1613 			}
1614 			if (!record__aio_enabled(rec)) {
1615 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1616 					if (synch)
1617 						map->core.flush = flush;
1618 					rc = -1;
1619 					goto out;
1620 				}
1621 			} else {
1622 				if (record__aio_push(rec, map, &off) < 0) {
1623 					record__aio_set_pos(trace_fd, off);
1624 					if (synch)
1625 						map->core.flush = flush;
1626 					rc = -1;
1627 					goto out;
1628 				}
1629 			}
1630 			if (synch)
1631 				map->core.flush = flush;
1632 		}
1633 
1634 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1635 		    !rec->opts.auxtrace_sample_mode &&
1636 		    record__auxtrace_mmap_read(rec, map) != 0) {
1637 			rc = -1;
1638 			goto out;
1639 		}
1640 	}
1641 
1642 	if (record__aio_enabled(rec))
1643 		record__aio_set_pos(trace_fd, off);
1644 
1645 	/*
1646 	 * Mark the round finished in case we wrote
1647 	 * at least one event.
1648 	 *
1649 	 * No need for round events in directory mode,
1650 	 * because the data in the per-cpu maps and files
1651 	 * is already sorted by the kernel.
1652 	 */
1653 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1654 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1655 
1656 	if (overwrite)
1657 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1658 out:
1659 	return rc;
1660 }
1661 
1662 static int record__mmap_read_all(struct record *rec, bool synch)
1663 {
1664 	int err;
1665 
1666 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1667 	if (err)
1668 		return err;
1669 
1670 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1671 }
1672 
1673 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1674 					   void *arg __maybe_unused)
1675 {
1676 	struct perf_mmap *map = fda->priv[fd].ptr;
1677 
1678 	if (map)
1679 		perf_mmap__put(map);
1680 }
1681 
1682 static void *record__thread(void *arg)
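/*
 * Body of a worker thread in parallel streaming mode: drain this thread's
 * mmaps, poll when no new samples arrived, terminate once the control pipe
 * is hung up, do one final synchronized read, and notify the main thread
 * over the ack pipe on start and on termination.
 */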
1683 {
1684 	enum thread_msg msg = THREAD_MSG__READY;
1685 	bool terminate = false;
1686 	struct fdarray *pollfd;
1687 	int err, ctlfd_pos;
1688 
1689 	thread = arg;
1690 	thread->tid = gettid();
1691 
1692 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1693 	if (err == -1)
1694 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1695 			   thread->tid, strerror(errno));
1696 
1697 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1698 
1699 	pollfd = &thread->pollfd;
1700 	ctlfd_pos = thread->ctlfd_pos;
1701 
1702 	for (;;) {
1703 		unsigned long long hits = thread->samples;
1704 
1705 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1706 			break;
1707 
1708 		if (hits == thread->samples) {
1709 
1710 			err = fdarray__poll(pollfd, -1);
1711 			/*
1712 			 * Propagate the error only if there is one. Ignore a positive
1713 			 * number of returned events and interrupt errors.
1714 			 */
1715 			if (err > 0 || (err < 0 && errno == EINTR))
1716 				err = 0;
1717 			thread->waking++;
1718 
1719 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1720 					    record__thread_munmap_filtered, NULL) == 0)
1721 				break;
1722 		}
1723 
1724 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1725 			terminate = true;
1726 			close(thread->pipes.msg[0]);
1727 			thread->pipes.msg[0] = -1;
1728 			pollfd->entries[ctlfd_pos].fd = -1;
1729 			pollfd->entries[ctlfd_pos].events = 0;
1730 		}
1731 
1732 		pollfd->entries[ctlfd_pos].revents = 0;
1733 	}
1734 	record__mmap_read_all(thread->rec, true);
1735 
1736 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1737 	if (err == -1)
1738 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1739 			   thread->tid, strerror(errno));
1740 
1741 	return NULL;
1742 }
1743 
1744 static void record__init_features(struct record *rec)
1745 {
1746 	struct perf_session *session = rec->session;
1747 	int feat;
1748 
1749 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1750 		perf_header__set_feat(&session->header, feat);
1751 
1752 	if (rec->no_buildid)
1753 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1754 
1755 	if (!have_tracepoints(&rec->evlist->core.entries))
1756 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1757 
1758 	if (!rec->opts.branch_stack)
1759 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1760 
1761 	if (!rec->opts.full_auxtrace)
1762 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1763 
1764 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1765 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1766 
1767 	if (!rec->opts.use_clockid)
1768 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1769 
1770 	if (!record__threads_enabled(rec))
1771 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1772 
1773 	if (!record__comp_enabled(rec))
1774 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1775 
1776 	perf_header__clear_feat(&session->header, HEADER_STAT);
1777 }
1778 
1779 static void
1780 record__finish_output(struct record *rec)
1781 {
1782 	int i;
1783 	struct perf_data *data = &rec->data;
1784 	int fd = perf_data__fd(data);
1785 
1786 	if (data->is_pipe) {
1787 		/* Just to display approx. size */
1788 		data->file.size = rec->bytes_written;
1789 		return;
1790 	}
1791 
1792 	rec->session->header.data_size += rec->bytes_written;
1793 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1794 	if (record__threads_enabled(rec)) {
1795 		for (i = 0; i < data->dir.nr; i++)
1796 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1797 	}
1798 
1799 	if (!rec->no_buildid) {
1800 		process_buildids(rec);
1801 
1802 		if (rec->buildid_all)
1803 			perf_session__dsos_hit_all(rec->session);
1804 	}
1805 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1806 
1807 	return;
1808 }
1809 
1810 static int record__synthesize_workload(struct record *rec, bool tail)
1811 {
1812 	int err;
1813 	struct perf_thread_map *thread_map;
1814 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1815 
1816 	if (rec->opts.tail_synthesize != tail)
1817 		return 0;
1818 
1819 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1820 	if (thread_map == NULL)
1821 		return -1;
1822 
1823 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1824 						 process_synthesized_event,
1825 						 &rec->session->machines.host,
1826 						 needs_mmap,
1827 						 rec->opts.sample_address);
1828 	perf_thread_map__put(thread_map);
1829 	return err;
1830 }
1831 
1832 static int write_finished_init(struct record *rec, bool tail)
1833 {
1834 	if (rec->opts.tail_synthesize != tail)
1835 		return 0;
1836 
1837 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1838 }
1839 
1840 static int record__synthesize(struct record *rec, bool tail);
1841 
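/*
 * Rotate the output for --switch-output: finish the current perf.data,
 * switch to a new timestamped file and re-synthesize the tracking events
 * so the new file is self-contained.
 */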
1842 static int
1843 record__switch_output(struct record *rec, bool at_exit)
1844 {
1845 	struct perf_data *data = &rec->data;
1846 	char *new_filename = NULL;
1847 	int fd, err;
1848 
1849 	/* Same Size:      "2015122520103046" */
1850 	char timestamp[] = "InvalidTimestamp";
1851 
1852 	record__aio_mmap_read_sync(rec);
1853 
1854 	write_finished_init(rec, true);
1855 
1856 	record__synthesize(rec, true);
1857 	if (target__none(&rec->opts.target))
1858 		record__synthesize_workload(rec, true);
1859 
1860 	rec->samples = 0;
1861 	record__finish_output(rec);
1862 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1863 	if (err) {
1864 		pr_err("Failed to get current timestamp\n");
1865 		return -EINVAL;
1866 	}
1867 
1868 	fd = perf_data__switch(data, timestamp,
1869 			       rec->session->header.data_offset,
1870 			       at_exit, &new_filename);
1871 	if (fd >= 0 && !at_exit) {
1872 		rec->bytes_written = 0;
1873 		rec->session->header.data_size = 0;
1874 	}
1875 
1876 	if (!quiet) {
1877 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1878 			data->path, timestamp);
1879 	}
1880 
1881 	if (rec->switch_output.num_files) {
1882 		int n = rec->switch_output.cur_file + 1;
1883 
1884 		if (n >= rec->switch_output.num_files)
1885 			n = 0;
1886 		rec->switch_output.cur_file = n;
1887 		if (rec->switch_output.filenames[n]) {
1888 			remove(rec->switch_output.filenames[n]);
1889 			zfree(&rec->switch_output.filenames[n]);
1890 		}
1891 		rec->switch_output.filenames[n] = new_filename;
1892 	} else {
1893 		free(new_filename);
1894 	}
1895 
1896 	/* Output tracking events */
1897 	if (!at_exit) {
1898 		record__synthesize(rec, false);
1899 
1900 		/*
1901 		 * In 'perf record --switch-output' without -a,
1902 		 * record__synthesize() in record__switch_output() won't
1903 		 * generate tracking events because there's no thread_map
1904 		 * in the evlist. This causes the newly created perf.data
1905 		 * to lack map and comm information.
1906 		 * Create a fake thread_map and directly call
1907 		 * perf_event__synthesize_thread_map() for those events.
1908 		 */
1909 		if (target__none(&rec->opts.target))
1910 			record__synthesize_workload(rec, false);
1911 		write_finished_init(rec, false);
1912 	}
1913 	return fd;
1914 }
1915 
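/*
 * Synthesize a single PERF_RECORD_LOST_SAMPLES event for the given
 * evsel/cpu/thread and write it to the output, appending an id sample so
 * that report tools can attribute the lost count.
 */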
1916 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1917 					struct perf_record_lost_samples *lost,
1918 					int cpu_idx, int thread_idx, u64 lost_count,
1919 					u16 misc_flag)
1920 {
1921 	struct perf_sample_id *sid;
1922 	struct perf_sample sample;
1923 	int id_hdr_size;
1924 
1925 	perf_sample__init(&sample, /*all=*/true);
1926 	lost->lost = lost_count;
1927 	if (evsel->core.ids) {
1928 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1929 		sample.id = sid->id;
1930 	}
1931 
1932 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1933 						       evsel->core.attr.sample_type, &sample);
1934 	lost->header.size = sizeof(*lost) + id_hdr_size;
1935 	lost->header.misc = misc_flag;
1936 	record__write(rec, NULL, lost, lost->header.size);
1937 	perf_sample__exit(&sample);
1938 }
1939 
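/*
 * Read the kernel's lost sample counts (and any BPF filter drops) for
 * every event and emit matching PERF_RECORD_LOST_SAMPLES records at the
 * end of the session.
 */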
1940 static void record__read_lost_samples(struct record *rec)
1941 {
1942 	struct perf_session *session = rec->session;
1943 	struct perf_record_lost_samples_and_ids lost;
1944 	struct evsel *evsel;
1945 
1946 	/* there was an error during record__open */
1947 	if (session->evlist == NULL)
1948 		return;
1949 
1950 	evlist__for_each_entry(session->evlist, evsel) {
1951 		struct xyarray *xy = evsel->core.sample_id;
1952 		u64 lost_count;
1953 
1954 		if (xy == NULL || evsel->core.fd == NULL)
1955 			continue;
1956 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1957 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1958 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1959 			continue;
1960 		}
1961 
1962 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1963 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1964 				struct perf_counts_values count;
1965 
1966 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1967 					pr_debug("read LOST count failed\n");
1968 					return;
1969 				}
1970 
1971 				if (count.lost) {
1972 					memset(&lost, 0, sizeof(lost));
1973 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1974 					__record__save_lost_samples(rec, evsel, &lost.lost,
1975 								    x, y, count.lost, 0);
1976 				}
1977 			}
1978 		}
1979 
1980 		lost_count = perf_bpf_filter__lost_count(evsel);
1981 		if (lost_count) {
1982 			memset(&lost, 0, sizeof(lost));
1983 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1984 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1985 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1986 		}
1987 	}
1988 }
1989 
1990 static volatile sig_atomic_t workload_exec_errno;
1991 
1992 /*
1993  * evlist__prepare_workload will send a SIGUSR1
1994  * if the fork fails, since we asked by setting its
1995  * want_signal to true.
1996  */
1997 static void workload_exec_failed_signal(int signo __maybe_unused,
1998 					siginfo_t *info,
1999 					void *ucontext __maybe_unused)
2000 {
2001 	workload_exec_errno = info->si_value.sival_int;
2002 	done = 1;
2003 	child_finished = 1;
2004 }
2005 
2006 static void snapshot_sig_handler(int sig);
2007 static void alarm_sig_handler(int sig);
2008 
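/*
 * Pick any mmapped event control page; its contents are used by
 * perf_event__synth_time_conv() to emit PERF_RECORD_TIME_CONV.
 */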
2009 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2010 {
2011 	if (evlist) {
2012 		if (evlist->mmap && evlist->mmap[0].core.base)
2013 			return evlist->mmap[0].core.base;
2014 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2015 			return evlist->overwrite_mmap[0].core.base;
2016 	}
2017 	return NULL;
2018 }
2019 
2020 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2021 {
2022 	return evlist__pick_pc(rec->evlist);
2026 }
2027 
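/*
 * Synthesize the non-sample events (attrs, maps, threads, cgroups, ...)
 * that describe the system state, either up front or, with
 * --tail-synthesize, at the end of the record session.
 */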
2028 static int record__synthesize(struct record *rec, bool tail)
2029 {
2030 	struct perf_session *session = rec->session;
2031 	struct machine *machine = &session->machines.host;
2032 	struct perf_data *data = &rec->data;
2033 	struct record_opts *opts = &rec->opts;
2034 	struct perf_tool *tool = &rec->tool;
2035 	int err = 0;
2036 	event_op f = process_synthesized_event;
2037 
2038 	if (rec->opts.tail_synthesize != tail)
2039 		return 0;
2040 
2041 	if (data->is_pipe) {
2042 		err = perf_event__synthesize_for_pipe(tool, session, data,
2043 						      process_synthesized_event);
2044 		if (err < 0)
2045 			goto out;
2046 
2047 		rec->bytes_written += err;
2048 	}
2049 
2050 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2051 					  process_synthesized_event, machine);
2052 	if (err)
2053 		goto out;
2054 
2055 	/* Synthesize id_index before auxtrace_info */
2056 	err = perf_event__synthesize_id_index(tool,
2057 					      process_synthesized_event,
2058 					      session->evlist, machine);
2059 	if (err)
2060 		goto out;
2061 
2062 	if (rec->opts.full_auxtrace) {
2063 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2064 					session, process_synthesized_event);
2065 		if (err)
2066 			goto out;
2067 	}
2068 
2069 	if (!evlist__exclude_kernel(rec->evlist)) {
2070 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2071 							 machine);
2072 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2073 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2074 				   "Check /proc/kallsyms permission or run as root.\n");
2075 
2076 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2077 						     machine);
2078 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2079 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2080 				   "Check /proc/modules permission or run as root.\n");
2081 	}
2082 
2083 	if (perf_guest) {
2084 		machines__process_guests(&session->machines,
2085 					 perf_event__synthesize_guest_os, tool);
2086 	}
2087 
2088 	err = perf_event__synthesize_extra_attr(&rec->tool,
2089 						rec->evlist,
2090 						process_synthesized_event,
2091 						data->is_pipe);
2092 	if (err)
2093 		goto out;
2094 
2095 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2096 						 process_synthesized_event,
2097 						NULL);
2098 	if (err < 0) {
2099 		pr_err("Couldn't synthesize thread map.\n");
2100 		return err;
2101 	}
2102 
2103 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2104 					     process_synthesized_event, NULL);
2105 	if (err < 0) {
2106 		pr_err("Couldn't synthesize cpu map.\n");
2107 		return err;
2108 	}
2109 
2110 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2111 						machine, opts);
2112 	if (err < 0) {
2113 		pr_warning("Couldn't synthesize bpf events.\n");
2114 		err = 0;
2115 	}
2116 
2117 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2118 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2119 						     machine);
2120 		if (err < 0) {
2121 			pr_warning("Couldn't synthesize cgroup events.\n");
2122 			err = 0;
2123 		}
2124 	}
2125 
2126 	if (rec->opts.nr_threads_synthesize > 1) {
2127 		mutex_init(&synth_lock);
2128 		perf_set_multithreaded();
2129 		f = process_locked_synthesized_event;
2130 	}
2131 
2132 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2133 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2134 
2135 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2136 						    rec->evlist->core.threads,
2137 						    f, needs_mmap, opts->sample_address,
2138 						    rec->opts.nr_threads_synthesize);
2139 	}
2140 
2141 	if (rec->opts.nr_threads_synthesize > 1) {
2142 		perf_set_singlethreaded();
2143 		mutex_destroy(&synth_lock);
2144 	}
2145 
2146 out:
2147 	return err;
2148 }
2149 
2150 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2151 {
2152 	struct record *rec = data;
2153 	pthread_kill(rec->thread_id, SIGUSR2);
2154 	return 0;
2155 }
2156 
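/*
 * Set up the side band evlist: wire up --switch-output-event to signal
 * the main thread with SIGUSR2 and, when available, add the BPF side band
 * event so that BPF programs loaded during the session can be annotated.
 */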
2157 static int record__setup_sb_evlist(struct record *rec)
2158 {
2159 	struct record_opts *opts = &rec->opts;
2160 
2161 	if (rec->sb_evlist != NULL) {
2162 		/*
2163 		 * We get here if --switch-output-event populated the
2164 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2165 		 * to the main thread.
2166 		 */
2167 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2168 		rec->thread_id = pthread_self();
2169 	}
2170 #ifdef HAVE_LIBBPF_SUPPORT
2171 	if (!opts->no_bpf_event) {
2172 		if (rec->sb_evlist == NULL) {
2173 			rec->sb_evlist = evlist__new();
2174 
2175 			if (rec->sb_evlist == NULL) {
2176 				pr_err("Couldn't create side band evlist.\n");
2177 				return -1;
2178 			}
2179 		}
2180 
2181 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2182 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2183 			return -1;
2184 		}
2185 	}
2186 #endif
2187 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2188 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2189 		opts->no_bpf_event = true;
2190 	}
2191 
2192 	return 0;
2193 }
2194 
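/*
 * When --clockid is used, store the clock id/resolution and a pair of
 * reference timestamps (time of day and the selected clock) in the header
 * so that report tools can convert sample times to wall clock time.
 */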
2195 static int record__init_clock(struct record *rec)
2196 {
2197 	struct perf_session *session = rec->session;
2198 	struct timespec ref_clockid;
2199 	struct timeval ref_tod;
2200 	u64 ref;
2201 
2202 	if (!rec->opts.use_clockid)
2203 		return 0;
2204 
2205 	if (rec->opts.clockid_res_ns)
2206 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2207 
2208 	session->header.env.clock.clockid = rec->opts.clockid;
2209 
2210 	if (gettimeofday(&ref_tod, NULL) != 0) {
2211 		pr_err("gettimeofday failed, cannot set reference time.\n");
2212 		return -1;
2213 	}
2214 
2215 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2216 		pr_err("clock_gettime failed, cannot set reference time.\n");
2217 		return -1;
2218 	}
2219 
2220 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2221 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2222 
2223 	session->header.env.clock.tod_ns = ref;
2224 
2225 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2226 	      (u64) ref_clockid.tv_nsec;
2227 
2228 	session->header.env.clock.clockid_ns = ref;
2229 	return 0;
2230 }
2231 
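/*
 * Kick off an AUX area snapshot if the trigger is armed; flag an error on
 * the trigger if starting the snapshot fails.
 */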
2232 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2233 {
2234 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2235 		trigger_hit(&auxtrace_snapshot_trigger);
2236 		auxtrace_record__snapshot_started = 1;
2237 		if (auxtrace_record__snapshot_start(rec->itr))
2238 			trigger_error(&auxtrace_snapshot_trigger);
2239 	}
2240 }
2241 
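/*
 * Ask a worker thread to terminate by closing the write end of its
 * message pipe, then wait for the acknowledgement on the ack pipe.
 */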
2242 static int record__terminate_thread(struct record_thread *thread_data)
2243 {
2244 	int err;
2245 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2246 	pid_t tid = thread_data->tid;
2247 
2248 	close(thread_data->pipes.msg[1]);
2249 	thread_data->pipes.msg[1] = -1;
2250 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2251 	if (err > 0)
2252 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2253 	else
2254 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2255 			   thread->tid, tid);
2256 
2257 	return 0;
2258 }
2259 
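/*
 * Start the additional reader threads (thread_data[1..]) with signals
 * blocked and their CPU affinity applied, then pin the main thread to its
 * own affinity mask.  thread_data[0] keeps running in the main thread.
 */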
2260 static int record__start_threads(struct record *rec)
2261 {
2262 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2263 	struct record_thread *thread_data = rec->thread_data;
2264 	sigset_t full, mask;
2265 	pthread_t handle;
2266 	pthread_attr_t attrs;
2267 
2268 	thread = &thread_data[0];
2269 
2270 	if (!record__threads_enabled(rec))
2271 		return 0;
2272 
2273 	sigfillset(&full);
2274 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2275 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2276 		return -1;
2277 	}
2278 
2279 	pthread_attr_init(&attrs);
2280 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2281 
2282 	for (t = 1; t < nr_threads; t++) {
2283 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2284 
2285 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2286 		pthread_attr_setaffinity_np(&attrs,
2287 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2288 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2289 #endif
2290 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2291 			for (tt = 1; tt < t; tt++)
2292 				record__terminate_thread(&thread_data[tt]);
2293 			pr_err("Failed to start threads: %s\n", strerror(errno));
2294 			ret = -1;
2295 			goto out_err;
2296 		}
2297 
2298 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2299 		if (err > 0)
2300 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2301 				  thread_msg_tags[msg]);
2302 		else
2303 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2304 				   thread->tid, rec->thread_data[t].tid);
2305 	}
2306 
2307 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2308 			(cpu_set_t *)thread->mask->affinity.bits);
2309 
2310 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2311 
2312 out_err:
2313 	pthread_attr_destroy(&attrs);
2314 
2315 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2316 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2317 		ret = -1;
2318 	}
2319 
2320 	return ret;
2321 }
2322 
2323 static int record__stop_threads(struct record *rec)
2324 {
2325 	int t;
2326 	struct record_thread *thread_data = rec->thread_data;
2327 
2328 	for (t = 1; t < rec->nr_threads; t++)
2329 		record__terminate_thread(&thread_data[t]);
2330 
2331 	for (t = 0; t < rec->nr_threads; t++) {
2332 		rec->samples += thread_data[t].samples;
2333 		if (!record__threads_enabled(rec))
2334 			continue;
2335 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2336 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2337 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2338 			 thread_data[t].samples, thread_data[t].waking);
2339 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2340 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2341 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2342 		else
2343 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2344 	}
2345 
2346 	return 0;
2347 }
2348 
2349 static unsigned long record__waking(struct record *rec)
2350 {
2351 	int t;
2352 	unsigned long waking = 0;
2353 	struct record_thread *thread_data = rec->thread_data;
2354 
2355 	for (t = 0; t < rec->nr_threads; t++)
2356 		waking += thread_data[t].waking;
2357 
2358 	return waking;
2359 }
2360 
2361 static int __cmd_record(struct record *rec, int argc, const char **argv)
2362 {
2363 	int err;
2364 	int status = 0;
2365 	const bool forks = argc > 0;
2366 	struct perf_tool *tool = &rec->tool;
2367 	struct record_opts *opts = &rec->opts;
2368 	struct perf_data *data = &rec->data;
2369 	struct perf_session *session;
2370 	bool disabled = false, draining = false;
2371 	int fd;
2372 	float ratio = 0;
2373 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2374 
2375 	atexit(record__sig_exit);
2376 	signal(SIGCHLD, sig_handler);
2377 	signal(SIGINT, sig_handler);
2378 	signal(SIGTERM, sig_handler);
2379 	signal(SIGSEGV, sigsegv_handler);
2380 
2381 	if (rec->opts.record_cgroup) {
2382 #ifndef HAVE_FILE_HANDLE
2383 		pr_err("cgroup tracking is not supported\n");
2384 		return -1;
2385 #endif
2386 	}
2387 
2388 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2389 		signal(SIGUSR2, snapshot_sig_handler);
2390 		if (rec->opts.auxtrace_snapshot_mode)
2391 			trigger_on(&auxtrace_snapshot_trigger);
2392 		if (rec->switch_output.enabled)
2393 			trigger_on(&switch_output_trigger);
2394 	} else {
2395 		signal(SIGUSR2, SIG_IGN);
2396 	}
2397 
2398 	perf_tool__init(tool, /*ordered_events=*/true);
2399 	tool->sample		= process_sample_event;
2400 	tool->fork		= perf_event__process_fork;
2401 	tool->exit		= perf_event__process_exit;
2402 	tool->comm		= perf_event__process_comm;
2403 	tool->namespaces	= perf_event__process_namespaces;
2404 	tool->mmap		= build_id__process_mmap;
2405 	tool->mmap2		= build_id__process_mmap2;
2406 	tool->itrace_start	= process_timestamp_boundary;
2407 	tool->aux		= process_timestamp_boundary;
2408 	tool->namespace_events	= rec->opts.record_namespaces;
2409 	tool->cgroup_events	= rec->opts.record_cgroup;
2410 	session = perf_session__new(data, tool);
2411 	if (IS_ERR(session)) {
2412 		pr_err("Perf session creation failed.\n");
2413 		return PTR_ERR(session);
2414 	}
2415 
2416 	if (record__threads_enabled(rec)) {
2417 		if (perf_data__is_pipe(&rec->data)) {
2418 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2419 			return -1;
2420 		}
2421 		if (rec->opts.full_auxtrace) {
2422 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2423 			return -1;
2424 		}
2425 	}
2426 
2427 	fd = perf_data__fd(data);
2428 	rec->session = session;
2429 
2430 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2431 		pr_err("Compression initialization failed.\n");
2432 		return -1;
2433 	}
2434 #ifdef HAVE_EVENTFD_SUPPORT
2435 	done_fd = eventfd(0, EFD_NONBLOCK);
2436 	if (done_fd < 0) {
2437 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2438 		status = -1;
2439 		goto out_delete_session;
2440 	}
2441 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2442 	if (err < 0) {
2443 		pr_err("Failed to add wakeup eventfd to poll list\n");
2444 		status = err;
2445 		goto out_delete_session;
2446 	}
2447 #endif // HAVE_EVENTFD_SUPPORT
2448 
2449 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2450 	session->header.env.comp_level = rec->opts.comp_level;
2451 
2452 	if (rec->opts.kcore &&
2453 	    !record__kcore_readable(&session->machines.host)) {
2454 		pr_err("ERROR: kcore is not readable.\n");
2455 		return -1;
2456 	}
2457 
2458 	if (record__init_clock(rec))
2459 		return -1;
2460 
2461 	record__init_features(rec);
2462 
2463 	if (forks) {
2464 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2465 					       workload_exec_failed_signal);
2466 		if (err < 0) {
2467 			pr_err("Couldn't run the workload!\n");
2468 			status = err;
2469 			goto out_delete_session;
2470 		}
2471 	}
2472 
2473 	/*
2474 	 * If we have just a single event and are sending data
2475 	 * through a pipe, we need to force the id allocation,
2476 	 * because we synthesize the event name through the pipe
2477 	 * and need the id for that.
2478 	 */
2479 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2480 		rec->opts.sample_id = true;
2481 
2482 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2483 		rec->timestamp_filename = false;
2484 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2485 	}
2486 
2487 	evlist__uniquify_name(rec->evlist);
2488 
2489 	evlist__config(rec->evlist, opts, &callchain_param);
2490 
2491 	/* Debug message used by test scripts */
2492 	pr_debug3("perf record opening and mmapping events\n");
2493 	if (record__open(rec) != 0) {
2494 		err = -1;
2495 		goto out_free_threads;
2496 	}
2497 	/* Debug message used by test scripts */
2498 	pr_debug3("perf record done opening and mmapping events\n");
2499 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2500 
2501 	if (rec->opts.kcore) {
2502 		err = record__kcore_copy(&session->machines.host, data);
2503 		if (err) {
2504 			pr_err("ERROR: Failed to copy kcore\n");
2505 			goto out_free_threads;
2506 		}
2507 	}
2508 
2509 	/*
2510 	 * Normally perf_session__new would do this, but it doesn't have the
2511 	 * evlist.
2512 	 */
2513 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2514 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2515 		rec->tool.ordered_events = false;
2516 	}
2517 
2518 	if (evlist__nr_groups(rec->evlist) == 0)
2519 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2520 
2521 	if (data->is_pipe) {
2522 		err = perf_header__write_pipe(fd);
2523 		if (err < 0)
2524 			goto out_free_threads;
2525 	} else {
2526 		err = perf_session__write_header(session, rec->evlist, fd, false);
2527 		if (err < 0)
2528 			goto out_free_threads;
2529 	}
2530 
2531 	err = -1;
2532 	if (!rec->no_buildid
2533 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2534 		pr_err("Couldn't generate buildids. "
2535 		       "Use --no-buildid to profile anyway.\n");
2536 		goto out_free_threads;
2537 	}
2538 
2539 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2540 		opts->no_bpf_event = true;
2541 
2542 	err = record__setup_sb_evlist(rec);
2543 	if (err)
2544 		goto out_free_threads;
2545 
2546 	err = record__synthesize(rec, false);
2547 	if (err < 0)
2548 		goto out_free_threads;
2549 
2550 	if (rec->realtime_prio) {
2551 		struct sched_param param;
2552 
2553 		param.sched_priority = rec->realtime_prio;
2554 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2555 			pr_err("Could not set realtime priority.\n");
2556 			err = -1;
2557 			goto out_free_threads;
2558 		}
2559 	}
2560 
2561 	if (record__start_threads(rec))
2562 		goto out_free_threads;
2563 
2564 	/*
2565 	 * When perf is starting the traced process, all the events
2566 	 * (apart from group members) have enable_on_exec=1 set,
2567 	 * so don't spoil it by prematurely enabling them.
2568 	 */
2569 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2570 		evlist__enable(rec->evlist);
2571 
2572 	/*
2573 	 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2574 	 * when recording a workload; enable it manually.
2575 	 */
2576 	if (rec->off_cpu)
2577 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2578 
2579 	/*
2580 	 * Let the child rip
2581 	 */
2582 	if (forks) {
2583 		struct machine *machine = &session->machines.host;
2584 		union perf_event *event;
2585 		pid_t tgid;
2586 
2587 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2588 		if (event == NULL) {
2589 			err = -ENOMEM;
2590 			goto out_child;
2591 		}
2592 
2593 		/*
2594 		 * Some H/W events are generated before the COMM event,
2595 		 * which is emitted during exec(), so perf script
2596 		 * cannot see a correct process name for those events.
2597 		 * Synthesize a COMM event to prevent that.
2598 		 */
2599 		tgid = perf_event__synthesize_comm(tool, event,
2600 						   rec->evlist->workload.pid,
2601 						   process_synthesized_event,
2602 						   machine);
2603 		free(event);
2604 
2605 		if (tgid == -1)
2606 			goto out_child;
2607 
2608 		event = malloc(sizeof(event->namespaces) +
2609 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2610 			       machine->id_hdr_size);
2611 		if (event == NULL) {
2612 			err = -ENOMEM;
2613 			goto out_child;
2614 		}
2615 
2616 		/*
2617 		 * Synthesize NAMESPACES event for the command specified.
2618 		 */
2619 		perf_event__synthesize_namespaces(tool, event,
2620 						  rec->evlist->workload.pid,
2621 						  tgid, process_synthesized_event,
2622 						  machine);
2623 		free(event);
2624 
2625 		evlist__start_workload(rec->evlist);
2626 	}
2627 
2628 	if (opts->target.initial_delay) {
2629 		pr_info(EVLIST_DISABLED_MSG);
2630 		if (opts->target.initial_delay > 0) {
2631 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2632 			evlist__enable(rec->evlist);
2633 			pr_info(EVLIST_ENABLED_MSG);
2634 		}
2635 	}
2636 
2637 	err = event_enable_timer__start(rec->evlist->eet);
2638 	if (err)
2639 		goto out_child;
2640 
2641 	/* Debug message used by test scripts */
2642 	pr_debug3("perf record has started\n");
2643 	fflush(stderr);
2644 
2645 	trigger_ready(&auxtrace_snapshot_trigger);
2646 	trigger_ready(&switch_output_trigger);
2647 	perf_hooks__invoke_record_start();
2648 
2649 	/*
2650 	 * Must write FINISHED_INIT so it will be seen after all other
2651 	 * synthesized user events, but before any regular events.
2652 	 */
2653 	err = write_finished_init(rec, false);
2654 	if (err < 0)
2655 		goto out_child;
2656 
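	/*
	 * Main event loop: drain the mmapped ring buffers, handle auxtrace
	 * snapshot and switch-output triggers, poll for new data and react to
	 * control fd commands until the workload exits or we are told to stop.
	 */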
2657 	for (;;) {
2658 		unsigned long long hits = thread->samples;
2659 
2660 		/*
2661 		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY
2662 		 * here: when done == true and hits != rec->samples
2663 		 * in the previous round.
2664 		 *
2665 		 * evlist__toggle_bkw_mmap() ensures we never convert
2666 		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2667 		 */
2668 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2669 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2670 
2671 		if (record__mmap_read_all(rec, false) < 0) {
2672 			trigger_error(&auxtrace_snapshot_trigger);
2673 			trigger_error(&switch_output_trigger);
2674 			err = -1;
2675 			goto out_child;
2676 		}
2677 
2678 		if (auxtrace_record__snapshot_started) {
2679 			auxtrace_record__snapshot_started = 0;
2680 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2681 				record__read_auxtrace_snapshot(rec, false);
2682 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2683 				pr_err("AUX area tracing snapshot failed\n");
2684 				err = -1;
2685 				goto out_child;
2686 			}
2687 		}
2688 
2689 		if (trigger_is_hit(&switch_output_trigger)) {
2690 			/*
2691 			 * If switch_output_trigger is hit, the data in the
2692 			 * overwritable ring buffer should have been collected,
2693 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2694 			 *
2695 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
2696 			 * record__mmap_read_all() didn't collect data from the
2697 			 * overwritable ring buffer. Read again.
2698 			 */
2699 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2700 				continue;
2701 			trigger_ready(&switch_output_trigger);
2702 
2703 			/*
2704 			 * Reenable events in overwrite ring buffer after
2705 			 * record__mmap_read_all(): we should have collected
2706 			 * data from it.
2707 			 */
2708 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2709 
2710 			if (!quiet)
2711 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2712 					record__waking(rec));
2713 			thread->waking = 0;
2714 			fd = record__switch_output(rec, false);
2715 			if (fd < 0) {
2716 				pr_err("Failed to switch to new file\n");
2717 				trigger_error(&switch_output_trigger);
2718 				err = fd;
2719 				goto out_child;
2720 			}
2721 
2722 			/* re-arm the alarm */
2723 			if (rec->switch_output.time)
2724 				alarm(rec->switch_output.time);
2725 		}
2726 
2727 		if (hits == thread->samples) {
2728 			if (done || draining)
2729 				break;
2730 			err = fdarray__poll(&thread->pollfd, -1);
2731 			/*
2732 			 * Propagate the error only if there is one. Ignore a positive
2733 			 * number of returned events and interrupt errors.
2734 			 */
2735 			if (err > 0 || (err < 0 && errno == EINTR))
2736 				err = 0;
2737 			thread->waking++;
2738 
2739 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2740 					    record__thread_munmap_filtered, NULL) == 0)
2741 				draining = true;
2742 
2743 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2744 			if (err)
2745 				goto out_child;
2746 		}
2747 
2748 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2749 			switch (cmd) {
2750 			case EVLIST_CTL_CMD_SNAPSHOT:
2751 				hit_auxtrace_snapshot_trigger(rec);
2752 				evlist__ctlfd_ack(rec->evlist);
2753 				break;
2754 			case EVLIST_CTL_CMD_STOP:
2755 				done = 1;
2756 				break;
2757 			case EVLIST_CTL_CMD_ACK:
2758 			case EVLIST_CTL_CMD_UNSUPPORTED:
2759 			case EVLIST_CTL_CMD_ENABLE:
2760 			case EVLIST_CTL_CMD_DISABLE:
2761 			case EVLIST_CTL_CMD_EVLIST:
2762 			case EVLIST_CTL_CMD_PING:
2763 			default:
2764 				break;
2765 			}
2766 		}
2767 
2768 		err = event_enable_timer__process(rec->evlist->eet);
2769 		if (err < 0)
2770 			goto out_child;
2771 		if (err) {
2772 			err = 0;
2773 			done = 1;
2774 		}
2775 
2776 		/*
2777 		 * When perf is starting the traced process, at the end events
2778 		 * die with the process and we wait for that. Thus no need to
2779 		 * disable events in this case.
2780 		 */
2781 		if (done && !disabled && !target__none(&opts->target)) {
2782 			trigger_off(&auxtrace_snapshot_trigger);
2783 			evlist__disable(rec->evlist);
2784 			disabled = true;
2785 		}
2786 	}
2787 
2788 	trigger_off(&auxtrace_snapshot_trigger);
2789 	trigger_off(&switch_output_trigger);
2790 
2791 	if (opts->auxtrace_snapshot_on_exit)
2792 		record__auxtrace_snapshot_exit(rec);
2793 
2794 	if (forks && workload_exec_errno) {
2795 		char msg[STRERR_BUFSIZE];
2796 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2797 		struct strbuf sb = STRBUF_INIT;
2798 
2799 		evlist__format_evsels(rec->evlist, &sb, 2048);
2800 
2801 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2802 			sb.buf, argv[0], emsg);
2803 		strbuf_release(&sb);
2804 		err = -1;
2805 		goto out_child;
2806 	}
2807 
2808 	if (!quiet)
2809 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2810 			record__waking(rec));
2811 
2812 	write_finished_init(rec, true);
2813 
2814 	if (target__none(&rec->opts.target))
2815 		record__synthesize_workload(rec, true);
2816 
2817 out_child:
2818 	record__stop_threads(rec);
2819 	record__mmap_read_all(rec, true);
2820 out_free_threads:
2821 	record__free_thread_data(rec);
2822 	evlist__finalize_ctlfd(rec->evlist);
2823 	record__aio_mmap_read_sync(rec);
2824 
2825 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2826 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2827 		session->header.env.comp_ratio = ratio + 0.5;
2828 	}
2829 
2830 	if (forks) {
2831 		int exit_status;
2832 
2833 		if (!child_finished)
2834 			kill(rec->evlist->workload.pid, SIGTERM);
2835 
2836 		wait(&exit_status);
2837 
2838 		if (err < 0)
2839 			status = err;
2840 		else if (WIFEXITED(exit_status))
2841 			status = WEXITSTATUS(exit_status);
2842 		else if (WIFSIGNALED(exit_status))
2843 			signr = WTERMSIG(exit_status);
2844 	} else
2845 		status = err;
2846 
2847 	if (rec->off_cpu)
2848 		rec->bytes_written += off_cpu_write(rec->session);
2849 
2850 	record__read_lost_samples(rec);
2851 	record__synthesize(rec, true);
2852 	/* this will be recalculated during process_buildids() */
2853 	rec->samples = 0;
2854 
2855 	if (!err) {
2856 		if (!rec->timestamp_filename) {
2857 			record__finish_output(rec);
2858 		} else {
2859 			fd = record__switch_output(rec, true);
2860 			if (fd < 0) {
2861 				status = fd;
2862 				goto out_delete_session;
2863 			}
2864 		}
2865 	}
2866 
2867 	perf_hooks__invoke_record_end();
2868 
2869 	if (!err && !quiet) {
2870 		char samples[128];
2871 		const char *postfix = rec->timestamp_filename ?
2872 					".<timestamp>" : "";
2873 
2874 		if (rec->samples && !rec->opts.full_auxtrace)
2875 			scnprintf(samples, sizeof(samples),
2876 				  " (%" PRIu64 " samples)", rec->samples);
2877 		else
2878 			samples[0] = '\0';
2879 
2880 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2881 			perf_data__size(data) / 1024.0 / 1024.0,
2882 			data->path, postfix, samples);
2883 		if (ratio) {
2884 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2885 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2886 					ratio);
2887 		}
2888 		fprintf(stderr, " ]\n");
2889 	}
2890 
2891 out_delete_session:
2892 #ifdef HAVE_EVENTFD_SUPPORT
2893 	if (done_fd >= 0) {
2894 		fd = done_fd;
2895 		done_fd = -1;
2896 
2897 		close(fd);
2898 	}
2899 #endif
2900 	zstd_fini(&session->zstd_data);
2901 	if (!opts->no_bpf_event)
2902 		evlist__stop_sb_thread(rec->sb_evlist);
2903 
2904 	perf_session__delete(session);
2905 	return status;
2906 }
2907 
2908 static void callchain_debug(struct callchain_param *callchain)
2909 {
2910 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2911 
2912 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2913 
2914 	if (callchain->record_mode == CALLCHAIN_DWARF)
2915 		pr_debug("callchain: stack dump size %d\n",
2916 			 callchain->dump_size);
2917 }
2918 
2919 int record_opts__parse_callchain(struct record_opts *record,
2920 				 struct callchain_param *callchain,
2921 				 const char *arg, bool unset)
2922 {
2923 	int ret;
2924 	callchain->enabled = !unset;
2925 
2926 	/* --no-call-graph */
2927 	if (unset) {
2928 		callchain->record_mode = CALLCHAIN_NONE;
2929 		pr_debug("callchain: disabled\n");
2930 		return 0;
2931 	}
2932 
2933 	ret = parse_callchain_record_opt(arg, callchain);
2934 	if (!ret) {
2935 		/* Enable data address sampling for DWARF unwind. */
2936 		if (callchain->record_mode == CALLCHAIN_DWARF)
2937 			record->sample_address = true;
2938 		callchain_debug(callchain);
2939 	}
2940 
2941 	return ret;
2942 }
2943 
2944 int record_parse_callchain_opt(const struct option *opt,
2945 			       const char *arg,
2946 			       int unset)
2947 {
2948 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2949 }
2950 
2951 int record_callchain_opt(const struct option *opt,
2952 			 const char *arg __maybe_unused,
2953 			 int unset __maybe_unused)
2954 {
2955 	struct callchain_param *callchain = opt->value;
2956 
2957 	callchain->enabled = true;
2958 
2959 	if (callchain->record_mode == CALLCHAIN_NONE)
2960 		callchain->record_mode = CALLCHAIN_FP;
2961 
2962 	callchain_debug(callchain);
2963 	return 0;
2964 }
2965 
2966 static int perf_record_config(const char *var, const char *value, void *cb)
2967 {
2968 	struct record *rec = cb;
2969 
2970 	if (!strcmp(var, "record.build-id")) {
2971 		if (!strcmp(value, "cache"))
2972 			rec->no_buildid_cache = false;
2973 		else if (!strcmp(value, "no-cache"))
2974 			rec->no_buildid_cache = true;
2975 		else if (!strcmp(value, "skip"))
2976 			rec->no_buildid = true;
2977 		else if (!strcmp(value, "mmap"))
2978 			rec->buildid_mmap = true;
2979 		else
2980 			return -1;
2981 		return 0;
2982 	}
2983 	if (!strcmp(var, "record.call-graph")) {
2984 		var = "call-graph.record-mode";
2985 		return perf_default_config(var, value, cb);
2986 	}
2987 #ifdef HAVE_AIO_SUPPORT
2988 	if (!strcmp(var, "record.aio")) {
2989 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2990 		if (!rec->opts.nr_cblocks)
2991 			rec->opts.nr_cblocks = nr_cblocks_default;
2992 	}
2993 #endif
2994 	if (!strcmp(var, "record.debuginfod")) {
2995 		rec->debuginfod.urls = strdup(value);
2996 		if (!rec->debuginfod.urls)
2997 			return -ENOMEM;
2998 		rec->debuginfod.set = true;
2999 	}
3000 
3001 	return 0;
3002 }
3003 
3004 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3005 {
3006 	struct record *rec = (struct record *)opt->value;
3007 
3008 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3009 }
3010 
3011 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3012 {
3013 	struct record_opts *opts = (struct record_opts *)opt->value;
3014 
3015 	if (unset || !str)
3016 		return 0;
3017 
3018 	if (!strcasecmp(str, "node"))
3019 		opts->affinity = PERF_AFFINITY_NODE;
3020 	else if (!strcasecmp(str, "cpu"))
3021 		opts->affinity = PERF_AFFINITY_CPU;
3022 
3023 	return 0;
3024 }
3025 
3026 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3027 {
3028 	mask->nbits = nr_bits;
3029 	mask->bits = bitmap_zalloc(mask->nbits);
3030 	if (!mask->bits)
3031 		return -ENOMEM;
3032 
3033 	return 0;
3034 }
3035 
3036 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3037 {
3038 	bitmap_free(mask->bits);
3039 	mask->nbits = 0;
3040 }
3041 
3042 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3043 {
3044 	int ret;
3045 
3046 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3047 	if (ret) {
3048 		mask->affinity.bits = NULL;
3049 		return ret;
3050 	}
3051 
3052 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3053 	if (ret) {
3054 		record__mmap_cpu_mask_free(&mask->maps);
3055 		mask->maps.bits = NULL;
3056 	}
3057 
3058 	return ret;
3059 }
3060 
3061 static void record__thread_mask_free(struct thread_mask *mask)
3062 {
3063 	record__mmap_cpu_mask_free(&mask->maps);
3064 	record__mmap_cpu_mask_free(&mask->affinity);
3065 }
3066 
3067 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3068 {
3069 	int s;
3070 	struct record_opts *opts = opt->value;
3071 
3072 	if (unset || !str || !strlen(str)) {
3073 		opts->threads_spec = THREAD_SPEC__CPU;
3074 	} else {
3075 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3076 			if (s == THREAD_SPEC__USER) {
3077 				opts->threads_user_spec = strdup(str);
3078 				if (!opts->threads_user_spec)
3079 					return -ENOMEM;
3080 				opts->threads_spec = THREAD_SPEC__USER;
3081 				break;
3082 			}
3083 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3084 				opts->threads_spec = s;
3085 				break;
3086 			}
3087 		}
3088 	}
3089 
3090 	if (opts->threads_spec == THREAD_SPEC__USER)
3091 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3092 	else
3093 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3094 
3095 	return 0;
3096 }
3097 
3098 static int parse_output_max_size(const struct option *opt,
3099 				 const char *str, int unset)
3100 {
3101 	unsigned long *s = (unsigned long *)opt->value;
3102 	static struct parse_tag tags_size[] = {
3103 		{ .tag  = 'B', .mult = 1       },
3104 		{ .tag  = 'K', .mult = 1 << 10 },
3105 		{ .tag  = 'M', .mult = 1 << 20 },
3106 		{ .tag  = 'G', .mult = 1 << 30 },
3107 		{ .tag  = 0 },
3108 	};
3109 	unsigned long val;
3110 
3111 	if (unset) {
3112 		*s = 0;
3113 		return 0;
3114 	}
3115 
3116 	val = parse_tag_value(str, tags_size);
3117 	if (val != (unsigned long) -1) {
3118 		*s = val;
3119 		return 0;
3120 	}
3121 
3122 	return -1;
3123 }
3124 
3125 static int record__parse_mmap_pages(const struct option *opt,
3126 				    const char *str,
3127 				    int unset __maybe_unused)
3128 {
3129 	struct record_opts *opts = opt->value;
3130 	char *s, *p;
3131 	unsigned int mmap_pages;
3132 	int ret;
3133 
3134 	if (!str)
3135 		return -EINVAL;
3136 
3137 	s = strdup(str);
3138 	if (!s)
3139 		return -ENOMEM;
3140 
3141 	p = strchr(s, ',');
3142 	if (p)
3143 		*p = '\0';
3144 
3145 	if (*s) {
3146 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3147 		if (ret)
3148 			goto out_free;
3149 		opts->mmap_pages = mmap_pages;
3150 	}
3151 
3152 	if (!p) {
3153 		ret = 0;
3154 		goto out_free;
3155 	}
3156 
3157 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3158 	if (ret)
3159 		goto out_free;
3160 
3161 	opts->auxtrace_mmap_pages = mmap_pages;
3162 
3163 out_free:
3164 	free(s);
3165 	return ret;
3166 }
3167 
3168 static int record__parse_off_cpu_thresh(const struct option *opt,
3169 					const char *str,
3170 					int unset __maybe_unused)
3171 {
3172 	struct record_opts *opts = opt->value;
3173 	char *endptr;
3174 	u64 off_cpu_thresh_ms;
3175 
3176 	if (!str)
3177 		return -EINVAL;
3178 
3179 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3180 
3181 	/* strtoull() returned 0 but the string isn't "0", so parsing failed */
3182 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3183 		return -EINVAL;
3184 	else
3185 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3186 
3187 	return 0;
3188 }
3189 
3190 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3191 {
3192 }
3193 
3194 static int parse_control_option(const struct option *opt,
3195 				const char *str,
3196 				int unset __maybe_unused)
3197 {
3198 	struct record_opts *opts = opt->value;
3199 
3200 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3201 }
3202 
3203 static void switch_output_size_warn(struct record *rec)
3204 {
3205 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3206 	struct switch_output *s = &rec->switch_output;
3207 
3208 	wakeup_size /= 2;
3209 
3210 	if (s->size < wakeup_size) {
3211 		char buf[100];
3212 
3213 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3214 		pr_warning("WARNING: switch-output data size lower than "
3215 			   "wakeup kernel buffer size (%s), "
3216 			   "expect bigger perf.data sizes\n", buf);
3217 	}
3218 }
3219 
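/*
 * Parse the --switch-output argument: "signal", a size (B/K/M/G) or a
 * time (s/m/h/d) threshold.  --switch-output-event implies the signal
 * variant.  Not available in parallel (--threads) streaming mode.
 */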
3220 static int switch_output_setup(struct record *rec)
3221 {
3222 	struct switch_output *s = &rec->switch_output;
3223 	static struct parse_tag tags_size[] = {
3224 		{ .tag  = 'B', .mult = 1       },
3225 		{ .tag  = 'K', .mult = 1 << 10 },
3226 		{ .tag  = 'M', .mult = 1 << 20 },
3227 		{ .tag  = 'G', .mult = 1 << 30 },
3228 		{ .tag  = 0 },
3229 	};
3230 	static struct parse_tag tags_time[] = {
3231 		{ .tag  = 's', .mult = 1        },
3232 		{ .tag  = 'm', .mult = 60       },
3233 		{ .tag  = 'h', .mult = 60*60    },
3234 		{ .tag  = 'd', .mult = 60*60*24 },
3235 		{ .tag  = 0 },
3236 	};
3237 	unsigned long val;
3238 
3239 	/*
3240 	 * If we're using --switch-output-event, then we imply
3241 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3242 	 * thread to its parent.
3243 	 */
3244 	if (rec->switch_output_event_set) {
3245 		if (record__threads_enabled(rec)) {
3246 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3247 			return 0;
3248 		}
3249 		goto do_signal;
3250 	}
3251 
3252 	if (!s->set)
3253 		return 0;
3254 
3255 	if (record__threads_enabled(rec)) {
3256 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3257 		return 0;
3258 	}
3259 
3260 	if (!strcmp(s->str, "signal")) {
3261 do_signal:
3262 		s->signal = true;
3263 		pr_debug("switch-output with SIGUSR2 signal\n");
3264 		goto enabled;
3265 	}
3266 
3267 	val = parse_tag_value(s->str, tags_size);
3268 	if (val != (unsigned long) -1) {
3269 		s->size = val;
3270 		pr_debug("switch-output with %s size threshold\n", s->str);
3271 		goto enabled;
3272 	}
3273 
3274 	val = parse_tag_value(s->str, tags_time);
3275 	if (val != (unsigned long) -1) {
3276 		s->time = val;
3277 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3278 			 s->str, s->time);
3279 		goto enabled;
3280 	}
3281 
3282 	return -1;
3283 
3284 enabled:
3285 	rec->timestamp_filename = true;
3286 	s->enabled              = true;
3287 
3288 	if (s->size && !rec->opts.no_buffering)
3289 		switch_output_size_warn(rec);
3290 
3291 	return 0;
3292 }
3293 
3294 static const char * const __record_usage[] = {
3295 	"perf record [<options>] [<command>]",
3296 	"perf record [<options>] -- <command> [<options>]",
3297 	NULL
3298 };
3299 const char * const *record_usage = __record_usage;
3300 
3301 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3302 				  struct perf_sample *sample, struct machine *machine)
3303 {
3304 	/*
3305 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3306 	 * so there's no need to add them twice.
3307 	 */
3308 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3309 		return 0;
3310 	return perf_event__process_mmap(tool, event, sample, machine);
3311 }
3312 
3313 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3314 				   struct perf_sample *sample, struct machine *machine)
3315 {
3316 	/*
3317 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3318 	 * so there's no need to add them twice.
3319 	 */
3320 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3321 		return 0;
3322 
3323 	return perf_event__process_mmap2(tool, event, sample, machine);
3324 }
3325 
3326 static int process_timestamp_boundary(const struct perf_tool *tool,
3327 				      union perf_event *event __maybe_unused,
3328 				      struct perf_sample *sample,
3329 				      struct machine *machine __maybe_unused)
3330 {
3331 	struct record *rec = container_of(tool, struct record, tool);
3332 
3333 	set_timestamp_boundary(rec, sample->time);
3334 	return 0;
3335 }
3336 
3337 static int parse_record_synth_option(const struct option *opt,
3338 				     const char *str,
3339 				     int unset __maybe_unused)
3340 {
3341 	struct record_opts *opts = opt->value;
3342 	char *p = strdup(str);
3343 
3344 	if (p == NULL)
3345 		return -1;
3346 
3347 	opts->synth = parse_synth_opt(p);
3348 	free(p);
3349 
3350 	if (opts->synth < 0) {
3351 		pr_err("Invalid synth option: %s\n", str);
3352 		return -1;
3353 	}
3354 	return 0;
3355 }
3356 
3357 /*
3358  * XXX Ideally this would be local to cmd_record() and passed to a record__new()
3359  * helper, because we need to have access to it in record__exit(), which is called
3360  * after cmd_record() exits, but since record_options needs to be accessible to
3361  * builtin-script, leave it here.
3362  *
3363  * At least we don't touch it in all the other functions here directly.
3364  *
3365  * Just say no to tons of global variables, sigh.
3366  */
3367 static struct record record = {
3368 	.opts = {
3369 		.sample_time	     = true,
3370 		.mmap_pages	     = UINT_MAX,
3371 		.user_freq	     = UINT_MAX,
3372 		.user_interval	     = ULLONG_MAX,
3373 		.freq		     = 4000,
3374 		.target		     = {
3375 			.uses_mmap   = true,
3376 			.default_per_cpu = true,
3377 		},
3378 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3379 		.nr_threads_synthesize = 1,
3380 		.ctl_fd              = -1,
3381 		.ctl_fd_ack          = -1,
3382 		.synth               = PERF_SYNTH_ALL,
3383 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3384 	},
3385 };
3386 
3387 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3388 	"\n\t\t\t\tDefault: fp";
3389 
3390 static bool dry_run;
3391 
3392 static struct parse_events_option_args parse_events_option_args = {
3393 	.evlistp = &record.evlist,
3394 };
3395 
3396 static struct parse_events_option_args switch_output_parse_events_option_args = {
3397 	.evlistp = &record.sb_evlist,
3398 };
3399 
3400 /*
3401  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3402  * with it and switch to use the library functions in perf_evlist that came
3403  * from builtin-record.c, i.e. use record_opts,
3404  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3405  * using pipes, etc.
3406  */
3407 static struct option __record_options[] = {
3408 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3409 		     "event selector. use 'perf list' to list available events",
3410 		     parse_events_option),
3411 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3412 		     "event filter", parse_filter),
3413 	OPT_BOOLEAN(0, "latency", &record.latency,
3414 		    "Enable data collection for latency profiling.\n"
3415 		    "\t\t\t  Use perf report --latency for latency-centric profile."),
3416 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3417 			   NULL, "don't record events from perf itself",
3418 			   exclude_perf),
3419 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3420 		    "record events on existing process id"),
3421 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3422 		    "record events on existing thread id"),
3423 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3424 		    "collect data with this RT SCHED_FIFO priority"),
3425 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3426 		    "collect data without buffering"),
3427 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3428 		    "collect raw sample records from all opened counters"),
3429 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3430 			    "system-wide collection from all CPUs"),
3431 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3432 		    "list of cpus to monitor"),
3433 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3434 	OPT_STRING('o', "output", &record.data.path, "file",
3435 		    "output file name"),
3436 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3437 			&record.opts.no_inherit_set,
3438 			"child tasks do not inherit counters"),
3439 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3440 		    "synthesize non-sample events at the end of output"),
3441 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3442 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3443 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3444 		    "Fail if the specified frequency can't be used"),
3445 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3446 		     "profile at this frequency",
3447 		      record__parse_freq),
3448 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3449 		     "number of mmap data pages and AUX area tracing mmap pages",
3450 		     record__parse_mmap_pages),
3451 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3452 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3453 		     record__mmap_flush_parse),
3454 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3455 			   NULL, "enables call-graph recording" ,
3456 			   &record_callchain_opt),
3457 	OPT_CALLBACK(0, "call-graph", &record.opts,
3458 		     "record_mode[,record_size]", record_callchain_help,
3459 		     &record_parse_callchain_opt),
3460 	OPT_INCR('v', "verbose", &verbose,
3461 		    "be more verbose (show counter open errors, etc)"),
3462 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3463 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3464 		    "per thread counts"),
3465 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3466 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3467 		    "Record the sample physical addresses"),
3468 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3469 		    "Record the sampled data address data page size"),
3470 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3471 		    "Record the sampled code address (ip) page size"),
3472 	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3473 		    "Record the data source for memory operations"),
3474 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3475 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3476 		    "Record the sample identifier"),
3477 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3478 			&record.opts.sample_time_set,
3479 			"Record the sample timestamps"),
3480 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3481 			"Record the sample period"),
3482 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3483 		    "don't sample"),
3484 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3485 			&record.no_buildid_cache_set,
3486 			"do not update the buildid cache"),
3487 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3488 			&record.no_buildid_set,
3489 			"do not collect buildids in perf.data"),
3490 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3491 		     "monitor event in cgroup name only",
3492 		     parse_cgroups),
3493 	OPT_CALLBACK('D', "delay", &record, "ms",
3494 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3495 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3496 		     record__parse_event_enable_time),
3497 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3498 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3499 		   "user to profile"),
3500 
3501 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3502 		     "branch any", "sample any taken branches",
3503 		     parse_branch_stack),
3504 
3505 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3506 		     "branch filter mask", "branch stack filter modes",
3507 		     parse_branch_stack),
3508 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3509 		    "sample by weight (on special events only)"),
3510 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3511 		    "sample transaction flags (special events only)"),
3512 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3513 		    "use per-thread mmaps"),
3514 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3515 		    "sample selected machine registers on interrupt,"
3516 		    " use '-I?' to list register names", parse_intr_regs),
3517 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3518 		    "sample selected machine registers on interrupt,"
3519 		    " use '--user-regs=?' to list register names", parse_user_regs),
3520 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3521 		    "Record running/enabled time of read (:S) events"),
3522 	OPT_CALLBACK('k', "clockid", &record.opts,
3523 	"clockid", "clockid to use for events, see clock_gettime()",
3524 	parse_clockid),
3525 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3526 			  "opts", "AUX area tracing Snapshot Mode", ""),
3527 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3528 			  "opts", "sample AUX area", ""),
3529 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3530 			"per thread proc mmap processing timeout in ms"),
3531 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3532 		    "Record namespaces events"),
3533 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3534 		    "Record cgroup events"),
3535 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3536 			&record.opts.record_switch_events_set,
3537 			"Record context switch events"),
3538 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3539 			 "Configure all used events to run in kernel space.",
3540 			 PARSE_OPT_EXCLUSIVE),
3541 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3542 			 "Configure all used events to run in user space.",
3543 			 PARSE_OPT_EXCLUSIVE),
3544 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3545 		    "collect kernel callchains"),
3546 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3547 		    "collect user callchains"),
3548 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3549 		   "file", "vmlinux pathname"),
3550 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3551 		    "Record build-id of all DSOs regardless of hits"),
3552 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3553 		    "Record build-id in map events"),
3554 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3555 		    "append timestamp to output filename"),
3556 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3557 		    "Record timestamp boundary (time of first/last samples)"),
3558 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3559 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3560 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3561 			  "signal"),
3562 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3563 			 &record.switch_output_event_set, "switch output event",
3564 			 "switch output event selector. use 'perf list' to list available events",
3565 			 parse_events_option_new_evlist),
3566 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3567 		   "Limit number of switch output generated files"),
3568 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3569 		    "Parse options then exit"),
3570 #ifdef HAVE_AIO_SUPPORT
3571 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3572 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3573 		     record__aio_parse),
3574 #endif
3575 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3576 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3577 		     record__parse_affinity),
3578 #ifdef HAVE_ZSTD_SUPPORT
3579 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3580 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3581 			    record__parse_comp_level),
3582 #endif
3583 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3584 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3585 	OPT_UINTEGER(0, "num-thread-synthesize",
3586 		     &record.opts.nr_threads_synthesize,
3587 		     "number of threads to run for event synthesis"),
3588 #ifdef HAVE_LIBPFM
3589 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3590 		"libpfm4 event selector. use 'perf list' to list available events",
3591 		parse_libpfm_events_option),
3592 #endif
3593 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3594 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3595 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3596 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3597 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3598 		      parse_control_option),
3599 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3600 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3601 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3602 			  &record.debuginfod.set, "debuginfod urls",
3603 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3604 			  "system"),
3605 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3606 			    "write collected trace data into several data files using parallel threads",
3607 			    record__parse_threads),
3608 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3609 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3610 		   "BPF filter action"),
3611 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3612 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3613 		     record__parse_off_cpu_thresh),
3614 	OPT_END()
3615 };
3616 
3617 struct option *record_options = __record_options;
3618 
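/*
 * Set in @mask the bit of every CPU contained in @cpus.  A dummy CPU
 * map (per-thread monitoring) leaves the mask untouched; a CPU number
 * beyond the size of the mask yields -ENODEV.
 */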
3619 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3620 {
3621 	struct perf_cpu cpu;
3622 	int idx;
3623 
3624 	if (cpu_map__is_dummy(cpus))
3625 		return 0;
3626 
3627 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3628 		/* Return -ENODEV if input cpu is greater than max cpu */
3629 		if ((unsigned long)cpu.cpu > mask->nbits)
3630 			return -ENODEV;
3631 		__set_bit(cpu.cpu, mask->bits);
3632 	}
3633 
3634 	return 0;
3635 }
3636 
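/*
 * Initialize @mask from a CPU list string such as "0-3" or "0,2,4",
 * parsed with perf_cpu_map__new().  Any previously set bits are
 * cleared first.
 */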
3637 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3638 {
3639 	struct perf_cpu_map *cpus;
3640 
3641 	cpus = perf_cpu_map__new(mask_spec);
3642 	if (!cpus)
3643 		return -ENOMEM;
3644 
3645 	bitmap_zero(mask->bits, mask->nbits);
3646 	if (record__mmap_cpu_mask_init(mask, cpus)) {
3647 		perf_cpu_map__put(cpus);
3648 		return -ENODEV;
3649 	}
3650 	perf_cpu_map__put(cpus);
3651 	return 0;
3652 }
3653 
3654 static void record__free_thread_masks(struct record *rec, int nr_threads)
3655 {
3656 	int t;
3657 
3658 	if (rec->thread_masks)
3659 		for (t = 0; t < nr_threads; t++)
3660 			record__thread_mask_free(&rec->thread_masks[t]);
3661 
3662 	zfree(&rec->thread_masks);
3663 }
3664 
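/*
 * Allocate @nr_threads thread_mask entries, each with maps and affinity
 * bitmaps of @nr_bits bits.
 */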
3665 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3666 {
3667 	int t, ret;
3668 
3669 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3670 	if (!rec->thread_masks) {
3671 		pr_err("Failed to allocate thread masks\n");
3672 		return -ENOMEM;
3673 	}
3674 
3675 	for (t = 0; t < nr_threads; t++) {
3676 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3677 		if (ret) {
3678 			pr_err("Failed to allocate thread masks[%d]\n", t);
3679 			goto out_free;
3680 		}
3681 	}
3682 
3683 	return 0;
3684 
3685 out_free:
3686 	record__free_thread_masks(rec, nr_threads);
3687 
3688 	return ret;
3689 }
3690 
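/*
 * One data streaming thread per monitored CPU (THREAD_SPEC__CPU): each
 * thread reads the mmap buffer of exactly one CPU and is affined to
 * that same CPU.
 */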
3691 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3692 {
3693 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3694 
3695 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3696 	if (ret)
3697 		return ret;
3698 
3699 	rec->nr_threads = nr_cpus;
3700 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3701 
3702 	for (t = 0; t < rec->nr_threads; t++) {
3703 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3704 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3705 		if (verbose > 0) {
3706 			pr_debug("thread_masks[%d]: ", t);
3707 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3708 			pr_debug("thread_masks[%d]: ", t);
3709 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3710 		}
3711 	}
3712 
3713 	return 0;
3714 }
3715 
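/*
 * Create one streaming thread per @maps_spec/@affinity_spec pair.  Each
 * spec is a CPU list that gets limited to the monitored CPUs (@cpus):
 * CPUs outside @cpus are silently ignored, but a mask that ends up
 * empty, or that intersects a previously defined one, is rejected with
 * -EINVAL.
 */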
3716 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3717 					  const char **maps_spec, const char **affinity_spec,
3718 					  u32 nr_spec)
3719 {
3720 	u32 s;
3721 	int ret = 0, t = 0;
3722 	struct mmap_cpu_mask cpus_mask;
3723 	struct thread_mask thread_mask, full_mask, *thread_masks;
3724 
3725 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3726 	if (ret) {
3727 		pr_err("Failed to allocate CPUs mask\n");
3728 		return ret;
3729 	}
3730 
3731 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3732 	if (ret) {
3733 		pr_err("Failed to init cpu mask\n");
3734 		goto out_free_cpu_mask;
3735 	}
3736 
3737 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3738 	if (ret) {
3739 		pr_err("Failed to allocate full mask\n");
3740 		goto out_free_cpu_mask;
3741 	}
3742 
3743 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3744 	if (ret) {
3745 		pr_err("Failed to allocate thread mask\n");
3746 		goto out_free_full_and_cpu_masks;
3747 	}
3748 
3749 	for (s = 0; s < nr_spec; s++) {
3750 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3751 		if (ret) {
3752 			pr_err("Failed to initialize maps thread mask\n");
3753 			goto out_free;
3754 		}
3755 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3756 		if (ret) {
3757 			pr_err("Failed to initialize affinity thread mask\n");
3758 			goto out_free;
3759 		}
3760 
3761 		/* ignore invalid CPUs but do not allow empty masks */
3762 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3763 				cpus_mask.bits, thread_mask.maps.nbits)) {
3764 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3765 			ret = -EINVAL;
3766 			goto out_free;
3767 		}
3768 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3769 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3770 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3771 			ret = -EINVAL;
3772 			goto out_free;
3773 		}
3774 
3775 		/* do not allow intersection with other masks (full_mask) */
3776 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3777 				      thread_mask.maps.nbits)) {
3778 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3779 			ret = -EINVAL;
3780 			goto out_free;
3781 		}
3782 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3783 				      thread_mask.affinity.nbits)) {
3784 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3785 			ret = -EINVAL;
3786 			goto out_free;
3787 		}
3788 
3789 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3790 			  thread_mask.maps.bits, full_mask.maps.nbits);
3791 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3792 		  thread_mask.affinity.bits, full_mask.affinity.nbits);
3793 
3794 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3795 		if (!thread_masks) {
3796 			pr_err("Failed to reallocate thread masks\n");
3797 			ret = -ENOMEM;
3798 			goto out_free;
3799 		}
3800 		rec->thread_masks = thread_masks;
3801 		rec->thread_masks[t] = thread_mask;
3802 		if (verbose > 0) {
3803 			pr_debug("thread_masks[%d]: ", t);
3804 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3805 			pr_debug("thread_masks[%d]: ", t);
3806 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3807 		}
3808 		t++;
3809 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3810 		if (ret) {
3811 			pr_err("Failed to allocate thread mask\n");
3812 			goto out_free_full_and_cpu_masks;
3813 		}
3814 	}
3815 	rec->nr_threads = t;
3816 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3817 	if (!rec->nr_threads)
3818 		ret = -EINVAL;
3819 
3820 out_free:
3821 	record__thread_mask_free(&thread_mask);
3822 out_free_full_and_cpu_masks:
3823 	record__thread_mask_free(&full_mask);
3824 out_free_cpu_mask:
3825 	record__mmap_cpu_mask_free(&cpus_mask);
3826 
3827 	return ret;
3828 }
3829 
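/*
 * One data streaming thread per CPU core (THREAD_SPEC__CORE), using the
 * per-core CPU lists from the CPU topology.
 */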
3830 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3831 {
3832 	int ret;
3833 	struct cpu_topology *topo;
3834 
3835 	topo = cpu_topology__new();
3836 	if (!topo) {
3837 		pr_err("Failed to allocate CPU topology\n");
3838 		return -ENOMEM;
3839 	}
3840 
3841 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3842 					     topo->core_cpus_list, topo->core_cpus_lists);
3843 	cpu_topology__delete(topo);
3844 
3845 	return ret;
3846 }
3847 
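/*
 * One data streaming thread per processor package (THREAD_SPEC__PACKAGE),
 * using the per-package CPU lists from the CPU topology.
 */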
3848 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3849 {
3850 	int ret;
3851 	struct cpu_topology *topo;
3852 
3853 	topo = cpu_topology__new();
3854 	if (!topo) {
3855 		pr_err("Failed to allocate CPU topology\n");
3856 		return -ENOMEM;
3857 	}
3858 
3859 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3860 					     topo->package_cpus_list, topo->package_cpus_lists);
3861 	cpu_topology__delete(topo);
3862 
3863 	return ret;
3864 }
3865 
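/*
 * One data streaming thread per NUMA node (THREAD_SPEC__NUMA), using
 * each node's CPU list as both the maps and the affinity spec.
 */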
3866 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3867 {
3868 	u32 s;
3869 	int ret;
3870 	const char **spec;
3871 	struct numa_topology *topo;
3872 
3873 	topo = numa_topology__new();
3874 	if (!topo) {
3875 		pr_err("Failed to allocate NUMA topology\n");
3876 		return -ENOMEM;
3877 	}
3878 
3879 	spec = zalloc(topo->nr * sizeof(char *));
3880 	if (!spec) {
3881 		pr_err("Failed to allocate NUMA spec\n");
3882 		ret = -ENOMEM;
3883 		goto out_delete_topo;
3884 	}
3885 	for (s = 0; s < topo->nr; s++)
3886 		spec[s] = topo->nodes[s].cpus;
3887 
3888 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3889 
3890 	zfree(&spec);
3891 
3892 out_delete_topo:
3893 	numa_topology__delete(topo);
3894 
3895 	return ret;
3896 }
3897 
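/*
 * User-defined thread specs (THREAD_SPEC__USER): the spec string is a
 * ':'-separated list of entries, each of the form
 * <maps CPU list>/<affinity CPU list>.  For example, on an 8-CPU
 * system, --threads=0-3/0-3:4-7/4-7 would create two streaming
 * threads, each reading and running on its own half of the CPUs.
 */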
3898 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3899 {
3900 	int t, ret;
3901 	u32 s, nr_spec = 0;
3902 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3903 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3904 
3905 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3906 		spec = strtok_r(user_spec, ":", &spec_ptr);
3907 		if (spec == NULL)
3908 			break;
3909 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3910 		mask = strtok_r(spec, "/", &mask_ptr);
3911 		if (mask == NULL)
3912 			break;
3913 		pr_debug2("  maps mask: %s\n", mask);
3914 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3915 		if (!tmp_spec) {
3916 			pr_err("Failed to reallocate maps spec\n");
3917 			ret = -ENOMEM;
3918 			goto out_free;
3919 		}
3920 		maps_spec = tmp_spec;
3921 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3922 		if (!maps_spec[nr_spec]) {
3923 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3924 			ret = -ENOMEM;
3925 			goto out_free;
3926 		}
3927 		mask = strtok_r(NULL, "/", &mask_ptr);
3928 		if (mask == NULL) {
3929 			pr_err("Invalid thread maps or affinity specs\n");
3930 			ret = -EINVAL;
3931 			goto out_free;
3932 		}
3933 		pr_debug2("  affinity mask: %s\n", mask);
3934 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3935 		if (!tmp_spec) {
3936 			pr_err("Failed to reallocate affinity spec\n");
3937 			ret = -ENOMEM;
3938 			goto out_free;
3939 		}
3940 		affinity_spec = tmp_spec;
3941 		affinity_spec[nr_spec] = strdup(mask);
3942 		if (!affinity_spec[nr_spec]) {
3943 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3944 			ret = -ENOMEM;
3945 			goto out_free;
3946 		}
3947 		dup_mask = NULL;
3948 		nr_spec++;
3949 	}
3950 
3951 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3952 					     (const char **)affinity_spec, nr_spec);
3953 
3954 out_free:
3955 	free(dup_mask);
3956 	for (s = 0; s < nr_spec; s++) {
3957 		if (maps_spec)
3958 			free(maps_spec[s]);
3959 		if (affinity_spec)
3960 			free(affinity_spec[s]);
3961 	}
3962 	free(affinity_spec);
3963 	free(maps_spec);
3964 
3965 	return ret;
3966 }
3967 
3968 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3969 {
3970 	int ret;
3971 
3972 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3973 	if (ret)
3974 		return ret;
3975 
3976 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3977 		return -ENODEV;
3978 
3979 	rec->nr_threads = 1;
3980 
3981 	return 0;
3982 }
3983 
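/*
 * Build the per-thread mmap and affinity masks according to the
 * requested --threads spec.  When parallel streaming is not enabled, a
 * single thread mask covering all monitored CPUs is used.
 */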
3984 static int record__init_thread_masks(struct record *rec)
3985 {
3986 	int ret = 0;
3987 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3988 
3989 	if (!record__threads_enabled(rec))
3990 		return record__init_thread_default_masks(rec, cpus);
3991 
3992 	if (evlist__per_thread(rec->evlist)) {
3993 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3994 		return -EINVAL;
3995 	}
3996 
3997 	switch (rec->opts.threads_spec) {
3998 	case THREAD_SPEC__CPU:
3999 		ret = record__init_thread_cpu_masks(rec, cpus);
4000 		break;
4001 	case THREAD_SPEC__CORE:
4002 		ret = record__init_thread_core_masks(rec, cpus);
4003 		break;
4004 	case THREAD_SPEC__PACKAGE:
4005 		ret = record__init_thread_package_masks(rec, cpus);
4006 		break;
4007 	case THREAD_SPEC__NUMA:
4008 		ret = record__init_thread_numa_masks(rec, cpus);
4009 		break;
4010 	case THREAD_SPEC__USER:
4011 		ret = record__init_thread_user_masks(rec, cpus);
4012 		break;
4013 	default:
4014 		break;
4015 	}
4016 
4017 	return ret;
4018 }
4019 
4020 int cmd_record(int argc, const char **argv)
4021 {
4022 	int err;
4023 	struct record *rec = &record;
4024 	char errbuf[BUFSIZ];
4025 
4026 	setlocale(LC_ALL, "");
4027 
4028 #ifndef HAVE_BPF_SKEL
4029 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4030 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4031 # undef set_nobuild
4032 #endif
4033 
4034 	/* Disable eager loading of kernel symbols, which adds overhead to perf record. */
4035 	symbol_conf.lazy_load_kernel_maps = true;
4036 	rec->opts.affinity = PERF_AFFINITY_SYS;
4037 
4038 	rec->evlist = evlist__new();
4039 	if (rec->evlist == NULL)
4040 		return -ENOMEM;
4041 
4042 	err = perf_config(perf_record_config, rec);
4043 	if (err)
4044 		return err;
4045 
4046 	argc = parse_options(argc, argv, record_options, record_usage,
4047 			    PARSE_OPT_STOP_AT_NON_OPTION);
4048 	if (quiet)
4049 		perf_quiet_option();
4050 
4051 	err = symbol__validate_sym_arguments();
4052 	if (err)
4053 		return err;
4054 
4055 	perf_debuginfod_setup(&record.debuginfod);
4056 
4057 	/* Make system wide (-a) the default target. */
4058 	if (!argc && target__none(&rec->opts.target))
4059 		rec->opts.target.system_wide = true;
4060 
4061 	if (nr_cgroups && !rec->opts.target.system_wide) {
4062 		usage_with_options_msg(record_usage, record_options,
4063 			"cgroup monitoring only available in system-wide mode");
4064 
4065 	}
4066 
4067 	if (record.latency) {
4068 		/*
4069 		 * There is no fundamental reason why latency profiling
4070 		 * can't work for system-wide mode, but exact semantics
4071 		 * and details are to be defined.
4072 		 * See the following thread for details:
4073 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4074 		 */
4075 		if (record.opts.target.system_wide) {
4076 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4077 			err = -EINVAL;
4078 			goto out_opts;
4079 		}
4080 		record.opts.record_switch_events = true;
4081 	}
4082 
4083 	if (rec->buildid_mmap) {
4084 		if (!perf_can_record_build_id()) {
4085 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4086 			err = -EINVAL;
4087 			goto out_opts;
4088 		}
4089 		pr_debug("Enabling build id in mmap2 events.\n");
4090 		/* Enable mmap build id synthesizing. */
4091 		symbol_conf.buildid_mmap2 = true;
4092 		/* Enable perf_event_attr::build_id bit. */
4093 		rec->opts.build_id = true;
4094 		/* Disable build id cache. */
4095 		rec->no_buildid = true;
4096 	}
4097 
4098 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4099 		pr_err("Kernel has no cgroup sampling support.\n");
4100 		err = -EINVAL;
4101 		goto out_opts;
4102 	}
4103 
4104 	if (rec->opts.kcore)
4105 		rec->opts.text_poke = true;
4106 
4107 	if (rec->opts.kcore || record__threads_enabled(rec))
4108 		rec->data.is_dir = true;
4109 
4110 	if (record__threads_enabled(rec)) {
4111 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4112 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4113 			goto out_opts;
4114 		}
4115 		if (record__aio_enabled(rec)) {
4116 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
4117 			goto out_opts;
4118 		}
4119 	}
4120 
4121 	if (rec->opts.comp_level != 0) {
4122 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4123 		rec->no_buildid = true;
4124 	}
4125 
4126 	if (rec->opts.record_switch_events &&
4127 	    !perf_can_record_switch_events()) {
4128 		ui__error("kernel does not support recording context switch events\n");
4129 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4130 		err = -EINVAL;
4131 		goto out_opts;
4132 	}
4133 
4134 	if (switch_output_setup(rec)) {
4135 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4136 		err = -EINVAL;
4137 		goto out_opts;
4138 	}
4139 
4140 	if (rec->switch_output.time) {
4141 		signal(SIGALRM, alarm_sig_handler);
4142 		alarm(rec->switch_output.time);
4143 	}
4144 
4145 	if (rec->switch_output.num_files) {
4146 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4147 						      sizeof(char *));
4148 		if (!rec->switch_output.filenames) {
4149 			err = -ENOMEM;
4150 			goto out_opts;
4151 		}
4152 	}
4153 
4154 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4155 		rec->timestamp_filename = false;
4156 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4157 	}
4158 
4159 	if (rec->filter_action) {
4160 		if (!strcmp(rec->filter_action, "pin"))
4161 			err = perf_bpf_filter__pin();
4162 		else if (!strcmp(rec->filter_action, "unpin"))
4163 			err = perf_bpf_filter__unpin();
4164 		else {
4165 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4166 			err = -EINVAL;
4167 		}
4168 		goto out_opts;
4169 	}
4170 
4171 	/* For backward compatibility, -d implies --mem-info */
4172 	if (rec->opts.sample_address)
4173 		rec->opts.sample_data_src = true;
4174 
4175 	/*
4176 	 * Allow aliases to facilitate the lookup of symbols for address
4177 	 * filters. Refer to auxtrace_parse_filters().
4178 	 */
4179 	symbol_conf.allow_aliases = true;
4180 
4181 	symbol__init(NULL);
4182 
4183 	err = record__auxtrace_init(rec);
4184 	if (err)
4185 		goto out;
4186 
4187 	if (dry_run)
4188 		goto out;
4189 
4190 	err = -ENOMEM;
4191 
4192 	if (rec->no_buildid_cache || rec->no_buildid) {
4193 		disable_buildid_cache();
4194 	} else if (rec->switch_output.enabled) {
4195 		/*
4196 		 * In 'perf record --switch-output', disable buildid
4197 		 * generation by default to reduce data file switching
4198 		 * overhead. Still generate buildids if they are required
4199 		 * explicitly using
4200 		 *
4201 		 *  perf record --switch-output --no-no-buildid \
4202 		 *              --no-no-buildid-cache
4203 		 *
4204 		 * Following code equals to:
4205 		 *
4206 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4207 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4208 		 *         disable_buildid_cache();
4209 		 */
4210 		bool disable = true;
4211 
4212 		if (rec->no_buildid_set && !rec->no_buildid)
4213 			disable = false;
4214 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4215 			disable = false;
4216 		if (disable) {
4217 			rec->no_buildid = true;
4218 			rec->no_buildid_cache = true;
4219 			disable_buildid_cache();
4220 		}
4221 	}
4222 
4223 	if (record.opts.overwrite)
4224 		record.opts.tail_synthesize = true;
4225 
4226 	if (rec->evlist->core.nr_entries == 0) {
4227 		err = parse_event(rec->evlist, "cycles:P");
4228 		if (err)
4229 			goto out;
4230 	}
4231 
4232 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4233 		rec->opts.no_inherit = true;
4234 
4235 	err = target__validate(&rec->opts.target);
4236 	if (err) {
4237 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4238 		ui__warning("%s\n", errbuf);
4239 	}
4240 
4241 	err = target__parse_uid(&rec->opts.target);
4242 	if (err) {
4243 		int saved_errno = errno;
4244 
4245 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4246 		ui__error("%s", errbuf);
4247 
4248 		err = -saved_errno;
4249 		goto out;
4250 	}
4251 
4252 	/* Enable ignoring missing threads when -u/-p option is defined. */
4253 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4254 
4255 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4256 
4257 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4258 		arch__add_leaf_frame_record_opts(&rec->opts);
4259 
4260 	err = -ENOMEM;
4261 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4262 		if (rec->opts.target.pid != NULL) {
4263 			pr_err("Couldn't create thread/CPU maps: %s\n",
4264 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4265 			goto out;
4266 		} else {
4267 			usage_with_options(record_usage, record_options);
4268 		}
4269 	}
4270 
4271 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4272 	if (err)
4273 		goto out;
4274 
4275 	/*
4276 	 * Take the build-ids of all DSOs when the file contains AUX area
4277 	 * tracing data: we do not decode the trace (it would take too
4278 	 * long), so we cannot know which DSOs were actually hit.
4279 	 */
4280 	if (rec->opts.full_auxtrace)
4281 		rec->buildid_all = true;
4282 
4283 	if (rec->opts.text_poke) {
4284 		err = record__config_text_poke(rec->evlist);
4285 		if (err) {
4286 			pr_err("record__config_text_poke failed, error %d\n", err);
4287 			goto out;
4288 		}
4289 	}
4290 
4291 	if (rec->off_cpu) {
4292 		err = record__config_off_cpu(rec);
4293 		if (err) {
4294 			pr_err("record__config_off_cpu failed, error %d\n", err);
4295 			goto out;
4296 		}
4297 	}
4298 
4299 	if (record_opts__config(&rec->opts)) {
4300 		err = -EINVAL;
4301 		goto out;
4302 	}
4303 
4304 	err = record__config_tracking_events(rec);
4305 	if (err) {
4306 		pr_err("record__config_tracking_events failed, error %d\n", err);
4307 		goto out;
4308 	}
4309 
4310 	err = record__init_thread_masks(rec);
4311 	if (err) {
4312 		pr_err("Failed to initialize parallel data streaming masks\n");
4313 		goto out;
4314 	}
4315 
4316 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4317 		rec->opts.nr_cblocks = nr_cblocks_max;
4318 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4319 
4320 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4321 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4322 
4323 	if (rec->opts.comp_level > comp_level_max)
4324 		rec->opts.comp_level = comp_level_max;
4325 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4326 
4327 	err = __cmd_record(&record, argc, argv);
4328 out:
4329 	record__free_thread_masks(rec, rec->nr_threads);
4330 	rec->nr_threads = 0;
4331 	symbol__exit();
4332 	auxtrace_record__free(rec->itr);
4333 out_opts:
4334 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4335 	evlist__delete(rec->evlist);
4336 	return err;
4337 }
4338 
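/*
 * SIGUSR2 handler: trigger an AUX area tracing snapshot and, when
 * --switch-output=signal is used, a switch to a new output file.
 */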
4339 static void snapshot_sig_handler(int sig __maybe_unused)
4340 {
4341 	struct record *rec = &record;
4342 
4343 	hit_auxtrace_snapshot_trigger(rec);
4344 
4345 	if (switch_output_signal(rec))
4346 		trigger_hit(&switch_output_trigger);
4347 }
4348 
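/*
 * SIGALRM handler, armed in cmd_record() when --switch-output is given
 * a time threshold: trigger a switch to a new output file.
 */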
4349 static void alarm_sig_handler(int sig __maybe_unused)
4350 {
4351 	struct record *rec = &record;
4352 
4353 	if (switch_output_time(rec))
4354 		trigger_hit(&switch_output_trigger);
4355 }
4356