xref: /linux/tools/perf/builtin-record.c (revision be59dba332e1e8edd3e88d991ba0e4795ae2bcb2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
86 struct switch_output {
87 	bool		 enabled;
88 	bool		 signal;
89 	unsigned long	 size;
90 	unsigned long	 time;
91 	const char	*str;
92 	bool		 set;
93 	char		 **filenames;
94 	int		 num_files;
95 	int		 cur_file;
96 };
97 
98 struct thread_mask {
99 	struct mmap_cpu_mask	maps;
100 	struct mmap_cpu_mask	affinity;
101 };
102 
103 struct record_thread {
104 	pid_t			tid;
105 	struct thread_mask	*mask;
106 	struct {
107 		int		msg[2];
108 		int		ack[2];
109 	} pipes;
110 	struct fdarray		pollfd;
111 	int			ctlfd_pos;
112 	int			nr_mmaps;
113 	struct mmap		**maps;
114 	struct mmap		**overwrite_maps;
115 	struct record		*rec;
116 	unsigned long long	samples;
117 	unsigned long		waking;
118 	u64			bytes_written;
119 	u64			bytes_transferred;
120 	u64			bytes_compressed;
121 };
122 
123 static __thread struct record_thread *thread;
124 
125 enum thread_msg {
126 	THREAD_MSG__UNDEFINED = 0,
127 	THREAD_MSG__READY,
128 	THREAD_MSG__MAX,
129 };
130 
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 	"UNDEFINED", "READY"
133 };
134 
135 enum thread_spec {
136 	THREAD_SPEC__UNDEFINED = 0,
137 	THREAD_SPEC__CPU,
138 	THREAD_SPEC__CORE,
139 	THREAD_SPEC__PACKAGE,
140 	THREAD_SPEC__NUMA,
141 	THREAD_SPEC__USER,
142 	THREAD_SPEC__MAX,
143 };
144 
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 	"undefined", "cpu", "core", "package", "numa", "user"
147 };
148 
149 struct pollfd_index_map {
150 	int evlist_pollfd_index;
151 	int thread_pollfd_index;
152 };
153 
154 struct record {
155 	struct perf_tool	tool;
156 	struct record_opts	opts;
157 	u64			bytes_written;
158 	u64			thread_bytes_written;
159 	struct perf_data	data;
160 	struct auxtrace_record	*itr;
161 	struct evlist	*evlist;
162 	struct perf_session	*session;
163 	struct evlist		*sb_evlist;
164 	pthread_t		thread_id;
165 	int			realtime_prio;
166 	bool			latency;
167 	bool			switch_output_event_set;
168 	bool			no_buildid;
169 	bool			no_buildid_set;
170 	bool			no_buildid_cache;
171 	bool			no_buildid_cache_set;
172 	bool			buildid_all;
173 	bool			buildid_mmap;
174 	bool			timestamp_filename;
175 	bool			timestamp_boundary;
176 	bool			off_cpu;
177 	const char		*filter_action;
178 	const char		*uid_str;
179 	struct switch_output	switch_output;
180 	unsigned long long	samples;
181 	unsigned long		output_max_size;	/* = 0: unlimited */
182 	struct perf_debuginfod	debuginfod;
183 	int			nr_threads;
184 	struct thread_mask	*thread_masks;
185 	struct record_thread	*thread_data;
186 	struct pollfd_index_map	*index_map;
187 	size_t			index_map_sz;
188 	size_t			index_map_cnt;
189 };
190 
191 static volatile int done;
192 
193 static volatile int auxtrace_record__snapshot_started;
194 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
195 static DEFINE_TRIGGER(switch_output_trigger);
196 
197 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
198 	"SYS", "NODE", "CPU"
199 };
200 
201 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
202 				  struct perf_sample *sample, struct machine *machine);
203 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
204 				   struct perf_sample *sample, struct machine *machine);
205 static int process_timestamp_boundary(const struct perf_tool *tool,
206 				      union perf_event *event,
207 				      struct perf_sample *sample,
208 				      struct machine *machine);
209 
210 #ifndef HAVE_GETTID
211 static inline pid_t gettid(void)
212 {
213 	return (pid_t)syscall(__NR_gettid);
214 }
215 #endif
216 
217 static int record__threads_enabled(struct record *rec)
218 {
219 	return rec->opts.threads_spec;
220 }
221 
222 static bool switch_output_signal(struct record *rec)
223 {
224 	return rec->switch_output.signal &&
225 	       trigger_is_ready(&switch_output_trigger);
226 }
227 
228 static bool switch_output_size(struct record *rec)
229 {
230 	return rec->switch_output.size &&
231 	       trigger_is_ready(&switch_output_trigger) &&
232 	       (rec->bytes_written >= rec->switch_output.size);
233 }
234 
235 static bool switch_output_time(struct record *rec)
236 {
237 	return rec->switch_output.time &&
238 	       trigger_is_ready(&switch_output_trigger);
239 }
240 
241 static u64 record__bytes_written(struct record *rec)
242 {
243 	return rec->bytes_written + rec->thread_bytes_written;
244 }
245 
246 static bool record__output_max_size_exceeded(struct record *rec)
247 {
248 	return rec->output_max_size &&
249 	       (record__bytes_written(rec) >= rec->output_max_size);
250 }
251 
252 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
253 			 void *bf, size_t size)
254 {
255 	struct perf_data_file *file = &rec->session->data->file;
256 
257 	if (map && map->file)
258 		file = map->file;
259 
260 	if (perf_data_file__write(file, bf, size) < 0) {
261 		pr_err("failed to write perf data, error: %m\n");
262 		return -1;
263 	}
264 
265 	if (map && map->file) {
266 		thread->bytes_written += size;
267 		rec->thread_bytes_written += size;
268 	} else {
269 		rec->bytes_written += size;
270 	}
271 
272 	if (record__output_max_size_exceeded(rec) && !done) {
273 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
274 				" stopping session ]\n",
275 				record__bytes_written(rec) >> 10);
276 		done = 1;
277 	}
278 
279 	if (switch_output_size(rec))
280 		trigger_hit(&switch_output_trigger);
281 
282 	return 0;
283 }
284 
285 static int record__aio_enabled(struct record *rec);
286 static int record__comp_enabled(struct record *rec);
287 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
288 			    void *dst, size_t dst_size, void *src, size_t src_size);
289 
290 #ifdef HAVE_AIO_SUPPORT
291 static int record__aio_write(struct aiocb *cblock, int trace_fd,
292 		void *buf, size_t size, off_t off)
293 {
294 	int rc;
295 
296 	cblock->aio_fildes = trace_fd;
297 	cblock->aio_buf    = buf;
298 	cblock->aio_nbytes = size;
299 	cblock->aio_offset = off;
300 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
301 
302 	do {
303 		rc = aio_write(cblock);
304 		if (rc == 0) {
305 			break;
306 		} else if (errno != EAGAIN) {
307 			cblock->aio_fildes = -1;
308 			pr_err("failed to queue perf data, error: %m\n");
309 			break;
310 		}
311 	} while (1);
312 
313 	return rc;
314 }
315 
316 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
317 {
318 	void *rem_buf;
319 	off_t rem_off;
320 	size_t rem_size;
321 	int rc, aio_errno;
322 	ssize_t aio_ret, written;
323 
324 	aio_errno = aio_error(cblock);
325 	if (aio_errno == EINPROGRESS)
326 		return 0;
327 
328 	written = aio_ret = aio_return(cblock);
329 	if (aio_ret < 0) {
330 		if (aio_errno != EINTR)
331 			pr_err("failed to write perf data, error: %m\n");
332 		written = 0;
333 	}
334 
335 	rem_size = cblock->aio_nbytes - written;
336 
337 	if (rem_size == 0) {
338 		cblock->aio_fildes = -1;
339 		/*
340 		 * md->refcount is incremented in record__aio_pushfn() for
341 		 * every aio write request started in record__aio_push() so
342 		 * decrement it because the request is now complete.
343 		 */
344 		perf_mmap__put(&md->core);
345 		rc = 1;
346 	} else {
347 		/*
348 		 * aio write request may require restart with the
349 		 * remainder if the kernel didn't write whole
350 		 * chunk at once.
351 		 */
352 		rem_off = cblock->aio_offset + written;
353 		rem_buf = (void *)(cblock->aio_buf + written);
354 		record__aio_write(cblock, cblock->aio_fildes,
355 				rem_buf, rem_size, rem_off);
356 		rc = 0;
357 	}
358 
359 	return rc;
360 }
361 
362 static int record__aio_sync(struct mmap *md, bool sync_all)
363 {
364 	struct aiocb **aiocb = md->aio.aiocb;
365 	struct aiocb *cblocks = md->aio.cblocks;
366 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
367 	int i, do_suspend;
368 
369 	do {
370 		do_suspend = 0;
371 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
372 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
373 				if (sync_all)
374 					aiocb[i] = NULL;
375 				else
376 					return i;
377 			} else {
378 				/*
379 				 * Started aio write is not complete yet
380 				 * so it has to be waited before the
381 				 * next allocation.
382 				 */
383 				aiocb[i] = &cblocks[i];
384 				do_suspend = 1;
385 			}
386 		}
387 		if (!do_suspend)
388 			return -1;
389 
390 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
391 			if (!(errno == EAGAIN || errno == EINTR))
392 				pr_err("failed to sync perf data, error: %m\n");
393 		}
394 	} while (1);
395 }
396 
397 struct record_aio {
398 	struct record	*rec;
399 	void		*data;
400 	size_t		size;
401 };
402 
403 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
404 {
405 	struct record_aio *aio = to;
406 
407 	/*
408 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
409 	 * to release space in the kernel buffer as fast as possible, calling
410 	 * perf_mmap__consume() from perf_mmap__push() function.
411 	 *
412 	 * That lets the kernel to proceed with storing more profiling data into
413 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
414 	 *
415 	 * Coping can be done in two steps in case the chunk of profiling data
416 	 * crosses the upper bound of the kernel buffer. In this case we first move
417 	 * part of data from map->start till the upper bound and then the remainder
418 	 * from the beginning of the kernel buffer till the end of the data chunk.
419 	 */
420 
421 	if (record__comp_enabled(aio->rec)) {
422 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
423 						   mmap__mmap_len(map) - aio->size,
424 						   buf, size);
425 		if (compressed < 0)
426 			return (int)compressed;
427 
428 		size = compressed;
429 	} else {
430 		memcpy(aio->data + aio->size, buf, size);
431 	}
432 
433 	if (!aio->size) {
434 		/*
435 		 * Increment map->refcount to guard map->aio.data[] buffer
436 		 * from premature deallocation because map object can be
437 		 * released earlier than aio write request started on
438 		 * map->aio.data[] buffer is complete.
439 		 *
440 		 * perf_mmap__put() is done at record__aio_complete()
441 		 * after started aio request completion or at record__aio_push()
442 		 * if the request failed to start.
443 		 */
444 		perf_mmap__get(&map->core);
445 	}
446 
447 	aio->size += size;
448 
449 	return size;
450 }
451 
452 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
453 {
454 	int ret, idx;
455 	int trace_fd = rec->session->data->file.fd;
456 	struct record_aio aio = { .rec = rec, .size = 0 };
457 
458 	/*
459 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
460 	 * becomes available after previous aio write operation.
461 	 */
462 
463 	idx = record__aio_sync(map, false);
464 	aio.data = map->aio.data[idx];
465 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
466 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
467 		return ret;
468 
469 	rec->samples++;
470 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
471 	if (!ret) {
472 		*off += aio.size;
473 		rec->bytes_written += aio.size;
474 		if (switch_output_size(rec))
475 			trigger_hit(&switch_output_trigger);
476 	} else {
477 		/*
478 		 * Decrement map->refcount incremented in record__aio_pushfn()
479 		 * back if record__aio_write() operation failed to start, otherwise
480 		 * map->refcount is decremented in record__aio_complete() after
481 		 * aio write operation finishes successfully.
482 		 */
483 		perf_mmap__put(&map->core);
484 	}
485 
486 	return ret;
487 }
488 
489 static off_t record__aio_get_pos(int trace_fd)
490 {
491 	return lseek(trace_fd, 0, SEEK_CUR);
492 }
493 
494 static void record__aio_set_pos(int trace_fd, off_t pos)
495 {
496 	lseek(trace_fd, pos, SEEK_SET);
497 }
498 
499 static void record__aio_mmap_read_sync(struct record *rec)
500 {
501 	int i;
502 	struct evlist *evlist = rec->evlist;
503 	struct mmap *maps = evlist->mmap;
504 
505 	if (!record__aio_enabled(rec))
506 		return;
507 
508 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
509 		struct mmap *map = &maps[i];
510 
511 		if (map->core.base)
512 			record__aio_sync(map, true);
513 	}
514 }
515 
516 static int nr_cblocks_default = 1;
517 static int nr_cblocks_max = 4;
518 
519 static int record__aio_parse(const struct option *opt,
520 			     const char *str,
521 			     int unset)
522 {
523 	struct record_opts *opts = (struct record_opts *)opt->value;
524 
525 	if (unset) {
526 		opts->nr_cblocks = 0;
527 	} else {
528 		if (str)
529 			opts->nr_cblocks = strtol(str, NULL, 0);
530 		if (!opts->nr_cblocks)
531 			opts->nr_cblocks = nr_cblocks_default;
532 	}
533 
534 	return 0;
535 }
536 #else /* HAVE_AIO_SUPPORT */
537 static int nr_cblocks_max = 0;
538 
539 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
540 			    off_t *off __maybe_unused)
541 {
542 	return -1;
543 }
544 
545 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
546 {
547 	return -1;
548 }
549 
550 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
551 {
552 }
553 
554 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
555 {
556 }
557 #endif
558 
559 static int record__aio_enabled(struct record *rec)
560 {
561 	return rec->opts.nr_cblocks > 0;
562 }
563 
564 #define MMAP_FLUSH_DEFAULT 1
565 static int record__mmap_flush_parse(const struct option *opt,
566 				    const char *str,
567 				    int unset)
568 {
569 	int flush_max;
570 	struct record_opts *opts = (struct record_opts *)opt->value;
571 	static struct parse_tag tags[] = {
572 			{ .tag  = 'B', .mult = 1       },
573 			{ .tag  = 'K', .mult = 1 << 10 },
574 			{ .tag  = 'M', .mult = 1 << 20 },
575 			{ .tag  = 'G', .mult = 1 << 30 },
576 			{ .tag  = 0 },
577 	};
578 
579 	if (unset)
580 		return 0;
581 
582 	if (str) {
583 		opts->mmap_flush = parse_tag_value(str, tags);
584 		if (opts->mmap_flush == (int)-1)
585 			opts->mmap_flush = strtol(str, NULL, 0);
586 	}
587 
588 	if (!opts->mmap_flush)
589 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
590 
591 	flush_max = evlist__mmap_size(opts->mmap_pages);
592 	flush_max /= 4;
593 	if (opts->mmap_flush > flush_max)
594 		opts->mmap_flush = flush_max;
595 
596 	return 0;
597 }
598 
599 #ifdef HAVE_ZSTD_SUPPORT
600 static unsigned int comp_level_default = 1;
601 
602 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
603 {
604 	struct record_opts *opts = opt->value;
605 
606 	if (unset) {
607 		opts->comp_level = 0;
608 	} else {
609 		if (str)
610 			opts->comp_level = strtol(str, NULL, 0);
611 		if (!opts->comp_level)
612 			opts->comp_level = comp_level_default;
613 	}
614 
615 	return 0;
616 }
617 #endif
618 static unsigned int comp_level_max = 22;
619 
620 static int record__comp_enabled(struct record *rec)
621 {
622 	return rec->opts.comp_level > 0;
623 }
624 
625 static int process_synthesized_event(const struct perf_tool *tool,
626 				     union perf_event *event,
627 				     struct perf_sample *sample __maybe_unused,
628 				     struct machine *machine __maybe_unused)
629 {
630 	struct record *rec = container_of(tool, struct record, tool);
631 	return record__write(rec, NULL, event, event->header.size);
632 }
633 
634 static struct mutex synth_lock;
635 
636 static int process_locked_synthesized_event(const struct perf_tool *tool,
637 				     union perf_event *event,
638 				     struct perf_sample *sample __maybe_unused,
639 				     struct machine *machine __maybe_unused)
640 {
641 	int ret;
642 
643 	mutex_lock(&synth_lock);
644 	ret = process_synthesized_event(tool, event, sample, machine);
645 	mutex_unlock(&synth_lock);
646 	return ret;
647 }
648 
649 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
650 {
651 	struct record *rec = to;
652 
653 	if (record__comp_enabled(rec)) {
654 		struct perf_record_compressed2 *event = map->data;
655 		size_t padding = 0;
656 		u8 pad[8] = {0};
657 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
658 						   mmap__mmap_len(map), bf, size);
659 
660 		if (compressed < 0)
661 			return (int)compressed;
662 
663 		bf = event;
664 		thread->samples++;
665 
666 		/*
667 		 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
668 		 * error. We make it aligned here.
669 		 */
670 		event->data_size = compressed - sizeof(struct perf_record_compressed2);
671 		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
672 		padding = event->header.size - compressed;
673 		return record__write(rec, map, bf, compressed) ||
674 		       record__write(rec, map, &pad, padding);
675 	}
676 
677 	thread->samples++;
678 	return record__write(rec, map, bf, size);
679 }
680 
681 static volatile sig_atomic_t signr = -1;
682 static volatile sig_atomic_t child_finished;
683 #ifdef HAVE_EVENTFD_SUPPORT
684 static volatile sig_atomic_t done_fd = -1;
685 #endif
686 
687 static void sig_handler(int sig)
688 {
689 	if (sig == SIGCHLD)
690 		child_finished = 1;
691 	else
692 		signr = sig;
693 
694 	done = 1;
695 #ifdef HAVE_EVENTFD_SUPPORT
696 	if (done_fd >= 0) {
697 		u64 tmp = 1;
698 		int orig_errno = errno;
699 
700 		/*
701 		 * It is possible for this signal handler to run after done is
702 		 * checked in the main loop, but before the perf counter fds are
703 		 * polled. If this happens, the poll() will continue to wait
704 		 * even though done is set, and will only break out if either
705 		 * another signal is received, or the counters are ready for
706 		 * read. To ensure the poll() doesn't sleep when done is set,
707 		 * use an eventfd (done_fd) to wake up the poll().
708 		 */
709 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
710 			pr_err("failed to signal wakeup fd, error: %m\n");
711 
712 		errno = orig_errno;
713 	}
714 #endif // HAVE_EVENTFD_SUPPORT
715 }
716 
717 static void sigsegv_handler(int sig)
718 {
719 	perf_hooks__recover();
720 	sighandler_dump_stack(sig);
721 }
722 
723 static void record__sig_exit(void)
724 {
725 	if (signr == -1)
726 		return;
727 
728 	signal(signr, SIG_DFL);
729 	raise(signr);
730 }
731 
732 #ifdef HAVE_AUXTRACE_SUPPORT
733 
734 static int record__process_auxtrace(const struct perf_tool *tool,
735 				    struct mmap *map,
736 				    union perf_event *event, void *data1,
737 				    size_t len1, void *data2, size_t len2)
738 {
739 	struct record *rec = container_of(tool, struct record, tool);
740 	struct perf_data *data = &rec->data;
741 	size_t padding;
742 	u8 pad[8] = {0};
743 
744 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
745 		off_t file_offset;
746 		int fd = perf_data__fd(data);
747 		int err;
748 
749 		file_offset = lseek(fd, 0, SEEK_CUR);
750 		if (file_offset == -1)
751 			return -1;
752 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
753 						     event, file_offset);
754 		if (err)
755 			return err;
756 	}
757 
758 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
759 	padding = (len1 + len2) & 7;
760 	if (padding)
761 		padding = 8 - padding;
762 
763 	record__write(rec, map, event, event->header.size);
764 	record__write(rec, map, data1, len1);
765 	if (len2)
766 		record__write(rec, map, data2, len2);
767 	record__write(rec, map, &pad, padding);
768 
769 	return 0;
770 }
771 
772 static int record__auxtrace_mmap_read(struct record *rec,
773 				      struct mmap *map)
774 {
775 	int ret;
776 
777 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
778 				  record__process_auxtrace);
779 	if (ret < 0)
780 		return ret;
781 
782 	if (ret)
783 		rec->samples++;
784 
785 	return 0;
786 }
787 
788 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
789 					       struct mmap *map)
790 {
791 	int ret;
792 
793 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
794 					   record__process_auxtrace,
795 					   rec->opts.auxtrace_snapshot_size);
796 	if (ret < 0)
797 		return ret;
798 
799 	if (ret)
800 		rec->samples++;
801 
802 	return 0;
803 }
804 
805 static int record__auxtrace_read_snapshot_all(struct record *rec)
806 {
807 	int i;
808 	int rc = 0;
809 
810 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
811 		struct mmap *map = &rec->evlist->mmap[i];
812 
813 		if (!map->auxtrace_mmap.base)
814 			continue;
815 
816 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
817 			rc = -1;
818 			goto out;
819 		}
820 	}
821 out:
822 	return rc;
823 }
824 
825 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
826 {
827 	pr_debug("Recording AUX area tracing snapshot\n");
828 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
829 		trigger_error(&auxtrace_snapshot_trigger);
830 	} else {
831 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
832 			trigger_error(&auxtrace_snapshot_trigger);
833 		else
834 			trigger_ready(&auxtrace_snapshot_trigger);
835 	}
836 }
837 
838 static int record__auxtrace_snapshot_exit(struct record *rec)
839 {
840 	if (trigger_is_error(&auxtrace_snapshot_trigger))
841 		return 0;
842 
843 	if (!auxtrace_record__snapshot_started &&
844 	    auxtrace_record__snapshot_start(rec->itr))
845 		return -1;
846 
847 	record__read_auxtrace_snapshot(rec, true);
848 	if (trigger_is_error(&auxtrace_snapshot_trigger))
849 		return -1;
850 
851 	return 0;
852 }
853 
854 static int record__auxtrace_init(struct record *rec)
855 {
856 	int err;
857 
858 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
859 	    && record__threads_enabled(rec)) {
860 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
861 		return -EINVAL;
862 	}
863 
864 	if (!rec->itr) {
865 		rec->itr = auxtrace_record__init(rec->evlist, &err);
866 		if (err)
867 			return err;
868 	}
869 
870 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
871 					      rec->opts.auxtrace_snapshot_opts);
872 	if (err)
873 		return err;
874 
875 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
876 					    rec->opts.auxtrace_sample_opts);
877 	if (err)
878 		return err;
879 
880 	err = auxtrace_parse_aux_action(rec->evlist);
881 	if (err)
882 		return err;
883 
884 	return auxtrace_parse_filters(rec->evlist);
885 }
886 
887 #else
888 
889 static inline
890 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
891 			       struct mmap *map __maybe_unused)
892 {
893 	return 0;
894 }
895 
896 static inline
897 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
898 				    bool on_exit __maybe_unused)
899 {
900 }
901 
902 static inline
903 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
904 {
905 	return 0;
906 }
907 
908 static inline
909 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
910 {
911 	return 0;
912 }
913 
914 static int record__auxtrace_init(struct record *rec __maybe_unused)
915 {
916 	return 0;
917 }
918 
919 #endif
920 
921 static int record__config_text_poke(struct evlist *evlist)
922 {
923 	struct evsel *evsel;
924 
925 	/* Nothing to do if text poke is already configured */
926 	evlist__for_each_entry(evlist, evsel) {
927 		if (evsel->core.attr.text_poke)
928 			return 0;
929 	}
930 
931 	evsel = evlist__add_dummy_on_all_cpus(evlist);
932 	if (!evsel)
933 		return -ENOMEM;
934 
935 	evsel->core.attr.text_poke = 1;
936 	evsel->core.attr.ksymbol = 1;
937 	evsel->immediate = true;
938 	evsel__set_sample_bit(evsel, TIME);
939 
940 	return 0;
941 }
942 
943 static int record__config_off_cpu(struct record *rec)
944 {
945 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
946 }
947 
948 static bool record__tracking_system_wide(struct record *rec)
949 {
950 	struct evlist *evlist = rec->evlist;
951 	struct evsel *evsel;
952 
953 	/*
954 	 * If non-dummy evsel exists, system_wide sideband is need to
955 	 * help parse sample information.
956 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
957 	 * and PERF_EVENT_COMM event to help parse task executable name.
958 	 */
959 	evlist__for_each_entry(evlist, evsel) {
960 		if (!evsel__is_dummy_event(evsel))
961 			return true;
962 	}
963 
964 	return false;
965 }
966 
967 static int record__config_tracking_events(struct record *rec)
968 {
969 	struct record_opts *opts = &rec->opts;
970 	struct evlist *evlist = rec->evlist;
971 	bool system_wide = false;
972 	struct evsel *evsel;
973 
974 	/*
975 	 * For initial_delay, system wide or a hybrid system, we need to add
976 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
977 	 * delay of waiting or event synthesis.
978 	 */
979 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
980 	    perf_pmus__num_core_pmus() > 1) {
981 
982 		/*
983 		 * User space tasks can migrate between CPUs, so when tracing
984 		 * selected CPUs, sideband for all CPUs is still needed.
985 		 */
986 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
987 			system_wide = true;
988 
989 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
990 		if (!evsel)
991 			return -ENOMEM;
992 
993 		/*
994 		 * Enable the tracking event when the process is forked for
995 		 * initial_delay, immediately for system wide.
996 		 */
997 		if (opts->target.initial_delay && !evsel->immediate &&
998 		    !target__has_cpu(&opts->target))
999 			evsel->core.attr.enable_on_exec = 1;
1000 		else
1001 			evsel->immediate = 1;
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 static bool record__kcore_readable(struct machine *machine)
1008 {
1009 	char kcore[PATH_MAX];
1010 	int fd;
1011 
1012 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
1013 
1014 	fd = open(kcore, O_RDONLY);
1015 	if (fd < 0)
1016 		return false;
1017 
1018 	close(fd);
1019 
1020 	return true;
1021 }
1022 
1023 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1024 {
1025 	char from_dir[PATH_MAX];
1026 	char kcore_dir[PATH_MAX];
1027 	int ret;
1028 
1029 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1030 
1031 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1032 	if (ret)
1033 		return ret;
1034 
1035 	return kcore_copy(from_dir, kcore_dir);
1036 }
1037 
1038 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1039 {
1040 	thread_data->pipes.msg[0] = -1;
1041 	thread_data->pipes.msg[1] = -1;
1042 	thread_data->pipes.ack[0] = -1;
1043 	thread_data->pipes.ack[1] = -1;
1044 }
1045 
1046 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1047 {
1048 	if (pipe(thread_data->pipes.msg))
1049 		return -EINVAL;
1050 
1051 	if (pipe(thread_data->pipes.ack)) {
1052 		close(thread_data->pipes.msg[0]);
1053 		thread_data->pipes.msg[0] = -1;
1054 		close(thread_data->pipes.msg[1]);
1055 		thread_data->pipes.msg[1] = -1;
1056 		return -EINVAL;
1057 	}
1058 
1059 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1060 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1061 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1062 
1063 	return 0;
1064 }
1065 
1066 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1067 {
1068 	if (thread_data->pipes.msg[0] != -1) {
1069 		close(thread_data->pipes.msg[0]);
1070 		thread_data->pipes.msg[0] = -1;
1071 	}
1072 	if (thread_data->pipes.msg[1] != -1) {
1073 		close(thread_data->pipes.msg[1]);
1074 		thread_data->pipes.msg[1] = -1;
1075 	}
1076 	if (thread_data->pipes.ack[0] != -1) {
1077 		close(thread_data->pipes.ack[0]);
1078 		thread_data->pipes.ack[0] = -1;
1079 	}
1080 	if (thread_data->pipes.ack[1] != -1) {
1081 		close(thread_data->pipes.ack[1]);
1082 		thread_data->pipes.ack[1] = -1;
1083 	}
1084 }
1085 
1086 static bool evlist__per_thread(struct evlist *evlist)
1087 {
1088 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1089 }
1090 
1091 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1092 {
1093 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1094 	struct mmap *mmap = evlist->mmap;
1095 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1096 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1097 	bool per_thread = evlist__per_thread(evlist);
1098 
1099 	if (per_thread)
1100 		thread_data->nr_mmaps = nr_mmaps;
1101 	else
1102 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1103 						      thread_data->mask->maps.nbits);
1104 	if (mmap) {
1105 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1106 		if (!thread_data->maps)
1107 			return -ENOMEM;
1108 	}
1109 	if (overwrite_mmap) {
1110 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1111 		if (!thread_data->overwrite_maps) {
1112 			zfree(&thread_data->maps);
1113 			return -ENOMEM;
1114 		}
1115 	}
1116 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1117 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1118 
1119 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1120 		if (per_thread ||
1121 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1122 			if (thread_data->maps) {
1123 				thread_data->maps[tm] = &mmap[m];
1124 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1125 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1126 			}
1127 			if (thread_data->overwrite_maps) {
1128 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1129 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1130 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1131 			}
1132 			tm++;
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1140 {
1141 	int f, tm, pos;
1142 	struct mmap *map, *overwrite_map;
1143 
1144 	fdarray__init(&thread_data->pollfd, 64);
1145 
1146 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1147 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1148 		overwrite_map = thread_data->overwrite_maps ?
1149 				thread_data->overwrite_maps[tm] : NULL;
1150 
1151 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1152 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1153 
1154 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1155 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1156 							      &evlist->core.pollfd);
1157 				if (pos < 0)
1158 					return pos;
1159 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1160 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1161 			}
1162 		}
1163 	}
1164 
1165 	return 0;
1166 }
1167 
1168 static void record__free_thread_data(struct record *rec)
1169 {
1170 	int t;
1171 	struct record_thread *thread_data = rec->thread_data;
1172 
1173 	if (thread_data == NULL)
1174 		return;
1175 
1176 	for (t = 0; t < rec->nr_threads; t++) {
1177 		record__thread_data_close_pipes(&thread_data[t]);
1178 		zfree(&thread_data[t].maps);
1179 		zfree(&thread_data[t].overwrite_maps);
1180 		fdarray__exit(&thread_data[t].pollfd);
1181 	}
1182 
1183 	zfree(&rec->thread_data);
1184 }
1185 
1186 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1187 						    int evlist_pollfd_index,
1188 						    int thread_pollfd_index)
1189 {
1190 	size_t x = rec->index_map_cnt;
1191 
1192 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1193 		return -ENOMEM;
1194 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1195 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1196 	rec->index_map_cnt += 1;
1197 	return 0;
1198 }
1199 
1200 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1201 						    struct evlist *evlist,
1202 						    struct record_thread *thread_data)
1203 {
1204 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1205 	struct pollfd *t_entries = thread_data->pollfd.entries;
1206 	int err = 0;
1207 	size_t i;
1208 
1209 	for (i = 0; i < rec->index_map_cnt; i++) {
1210 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1211 		int t_pos = rec->index_map[i].thread_pollfd_index;
1212 
1213 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1214 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1215 			pr_err("Thread and evlist pollfd index mismatch\n");
1216 			err = -EINVAL;
1217 			continue;
1218 		}
1219 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1220 	}
1221 	return err;
1222 }
1223 
1224 static int record__dup_non_perf_events(struct record *rec,
1225 				       struct evlist *evlist,
1226 				       struct record_thread *thread_data)
1227 {
1228 	struct fdarray *fda = &evlist->core.pollfd;
1229 	int i, ret;
1230 
1231 	for (i = 0; i < fda->nr; i++) {
1232 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1233 			continue;
1234 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1235 		if (ret < 0) {
1236 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1237 			return ret;
1238 		}
1239 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1240 			  thread_data, ret, fda->entries[i].fd);
1241 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1242 		if (ret < 0) {
1243 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1244 			return ret;
1245 		}
1246 	}
1247 	return 0;
1248 }
1249 
1250 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1251 {
1252 	int t, ret;
1253 	struct record_thread *thread_data;
1254 
1255 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1256 	if (!rec->thread_data) {
1257 		pr_err("Failed to allocate thread data\n");
1258 		return -ENOMEM;
1259 	}
1260 	thread_data = rec->thread_data;
1261 
1262 	for (t = 0; t < rec->nr_threads; t++)
1263 		record__thread_data_init_pipes(&thread_data[t]);
1264 
1265 	for (t = 0; t < rec->nr_threads; t++) {
1266 		thread_data[t].rec = rec;
1267 		thread_data[t].mask = &rec->thread_masks[t];
1268 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1269 		if (ret) {
1270 			pr_err("Failed to initialize thread[%d] maps\n", t);
1271 			goto out_free;
1272 		}
1273 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1274 		if (ret) {
1275 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1276 			goto out_free;
1277 		}
1278 		if (t) {
1279 			thread_data[t].tid = -1;
1280 			ret = record__thread_data_open_pipes(&thread_data[t]);
1281 			if (ret) {
1282 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1283 				goto out_free;
1284 			}
1285 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1286 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1287 			if (ret < 0) {
1288 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1289 				goto out_free;
1290 			}
1291 			thread_data[t].ctlfd_pos = ret;
1292 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1293 				 thread_data, thread_data[t].ctlfd_pos,
1294 				 thread_data[t].pipes.msg[0]);
1295 		} else {
1296 			thread_data[t].tid = gettid();
1297 
1298 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1299 			if (ret < 0)
1300 				goto out_free;
1301 
1302 			thread_data[t].ctlfd_pos = -1; /* Not used */
1303 		}
1304 	}
1305 
1306 	return 0;
1307 
1308 out_free:
1309 	record__free_thread_data(rec);
1310 
1311 	return ret;
1312 }
1313 
1314 static int record__mmap_evlist(struct record *rec,
1315 			       struct evlist *evlist)
1316 {
1317 	int i, ret;
1318 	struct record_opts *opts = &rec->opts;
1319 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1320 				  opts->auxtrace_sample_mode;
1321 	char msg[512];
1322 
1323 	if (opts->affinity != PERF_AFFINITY_SYS)
1324 		cpu__setup_cpunode_map();
1325 
1326 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1327 				 opts->auxtrace_mmap_pages,
1328 				 auxtrace_overwrite,
1329 				 opts->nr_cblocks, opts->affinity,
1330 				 opts->mmap_flush, opts->comp_level) < 0) {
1331 		if (errno == EPERM) {
1332 			pr_err("Permission error mapping pages.\n"
1333 			       "Consider increasing "
1334 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1335 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1336 			       "(current value: %u,%u)\n",
1337 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1338 			return -errno;
1339 		} else {
1340 			pr_err("failed to mmap with %d (%s)\n", errno,
1341 				str_error_r(errno, msg, sizeof(msg)));
1342 			if (errno)
1343 				return -errno;
1344 			else
1345 				return -EINVAL;
1346 		}
1347 	}
1348 
1349 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1350 		return -1;
1351 
1352 	ret = record__alloc_thread_data(rec, evlist);
1353 	if (ret)
1354 		return ret;
1355 
1356 	if (record__threads_enabled(rec)) {
1357 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1358 		if (ret) {
1359 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1360 			return ret;
1361 		}
1362 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1363 			if (evlist->mmap)
1364 				evlist->mmap[i].file = &rec->data.dir.files[i];
1365 			if (evlist->overwrite_mmap)
1366 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1367 		}
1368 	}
1369 
1370 	return 0;
1371 }
1372 
1373 static int record__mmap(struct record *rec)
1374 {
1375 	return record__mmap_evlist(rec, rec->evlist);
1376 }
1377 
1378 static int record__open(struct record *rec)
1379 {
1380 	char msg[BUFSIZ];
1381 	struct evsel *pos;
1382 	struct evlist *evlist = rec->evlist;
1383 	struct perf_session *session = rec->session;
1384 	struct record_opts *opts = &rec->opts;
1385 	int rc = 0;
1386 
1387 	evlist__for_each_entry(evlist, pos) {
1388 try_again:
1389 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1390 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1391 				if (verbose > 0)
1392 					ui__warning("%s\n", msg);
1393 				goto try_again;
1394 			}
1395 			if ((errno == EINVAL || errno == EBADF) &&
1396 			    pos->core.leader != &pos->core &&
1397 			    pos->weak_group) {
1398 			        pos = evlist__reset_weak_group(evlist, pos, true);
1399 				goto try_again;
1400 			}
1401 			rc = -errno;
1402 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1403 			ui__error("%s\n", msg);
1404 			goto out;
1405 		}
1406 
1407 		pos->supported = true;
1408 	}
1409 
1410 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1411 		pr_warning(
1412 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1413 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1414 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1415 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1416 "Samples in kernel modules won't be resolved at all.\n\n"
1417 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1418 "even with a suitable vmlinux or kallsyms file.\n\n");
1419 	}
1420 
1421 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1422 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1423 			pos->filter ?: "BPF", evsel__name(pos), errno,
1424 			str_error_r(errno, msg, sizeof(msg)));
1425 		rc = -1;
1426 		goto out;
1427 	}
1428 
1429 	rc = record__mmap(rec);
1430 	if (rc)
1431 		goto out;
1432 
1433 	session->evlist = evlist;
1434 	perf_session__set_id_hdr_size(session);
1435 out:
1436 	return rc;
1437 }
1438 
1439 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1440 {
1441 	if (rec->evlist->first_sample_time == 0)
1442 		rec->evlist->first_sample_time = sample_time;
1443 
1444 	if (sample_time)
1445 		rec->evlist->last_sample_time = sample_time;
1446 }
1447 
1448 static int process_sample_event(const struct perf_tool *tool,
1449 				union perf_event *event,
1450 				struct perf_sample *sample,
1451 				struct evsel *evsel,
1452 				struct machine *machine)
1453 {
1454 	struct record *rec = container_of(tool, struct record, tool);
1455 
1456 	set_timestamp_boundary(rec, sample->time);
1457 
1458 	if (rec->buildid_all)
1459 		return 0;
1460 
1461 	rec->samples++;
1462 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1463 }
1464 
1465 static int process_buildids(struct record *rec)
1466 {
1467 	struct perf_session *session = rec->session;
1468 
1469 	if (perf_data__size(&rec->data) == 0)
1470 		return 0;
1471 
1472 	/*
1473 	 * During this process, it'll load kernel map and replace the
1474 	 * dso->long_name to a real pathname it found.  In this case
1475 	 * we prefer the vmlinux path like
1476 	 *   /lib/modules/3.16.4/build/vmlinux
1477 	 *
1478 	 * rather than build-id path (in debug directory).
1479 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1480 	 */
1481 	symbol_conf.ignore_vmlinux_buildid = true;
1482 
1483 	/*
1484 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1485 	 * so no need to process samples. But if timestamp_boundary is enabled,
1486 	 * it still needs to walk on all samples to get the timestamps of
1487 	 * first/last samples.
1488 	 */
1489 	if (rec->buildid_all && !rec->timestamp_boundary)
1490 		rec->tool.sample = process_event_sample_stub;
1491 
1492 	return perf_session__process_events(session);
1493 }
1494 
1495 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1496 {
1497 	int err;
1498 	struct perf_tool *tool = data;
1499 	/*
1500 	 *As for guest kernel when processing subcommand record&report,
1501 	 *we arrange module mmap prior to guest kernel mmap and trigger
1502 	 *a preload dso because default guest module symbols are loaded
1503 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1504 	 *method is used to avoid symbol missing when the first addr is
1505 	 *in module instead of in guest kernel.
1506 	 */
1507 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1508 					     machine);
1509 	if (err < 0)
1510 		pr_err("Couldn't record guest kernel [%d]'s reference"
1511 		       " relocation symbol.\n", machine->pid);
1512 
1513 	/*
1514 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1515 	 * have no _text sometimes.
1516 	 */
1517 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1518 						 machine);
1519 	if (err < 0)
1520 		pr_err("Couldn't record guest kernel [%d]'s reference"
1521 		       " relocation symbol.\n", machine->pid);
1522 }
1523 
1524 static struct perf_event_header finished_round_event = {
1525 	.size = sizeof(struct perf_event_header),
1526 	.type = PERF_RECORD_FINISHED_ROUND,
1527 };
1528 
1529 static struct perf_event_header finished_init_event = {
1530 	.size = sizeof(struct perf_event_header),
1531 	.type = PERF_RECORD_FINISHED_INIT,
1532 };
1533 
1534 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1535 {
1536 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1537 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1538 			  thread->mask->affinity.nbits)) {
1539 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1540 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1541 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1542 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1543 					(cpu_set_t *)thread->mask->affinity.bits);
1544 		if (verbose == 2) {
1545 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1546 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1547 		}
1548 	}
1549 }
1550 
1551 static size_t process_comp_header(void *record, size_t increment)
1552 {
1553 	struct perf_record_compressed2 *event = record;
1554 	size_t size = sizeof(*event);
1555 
1556 	if (increment) {
1557 		event->header.size += increment;
1558 		return increment;
1559 	}
1560 
1561 	event->header.type = PERF_RECORD_COMPRESSED2;
1562 	event->header.size = size;
1563 
1564 	return size;
1565 }
1566 
1567 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1568 			    void *dst, size_t dst_size, void *src, size_t src_size)
1569 {
1570 	ssize_t compressed;
1571 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1572 	struct zstd_data *zstd_data = &session->zstd_data;
1573 
1574 	if (map && map->file)
1575 		zstd_data = &map->zstd_data;
1576 
1577 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1578 						     max_record_size, process_comp_header);
1579 	if (compressed < 0)
1580 		return compressed;
1581 
1582 	if (map && map->file) {
1583 		thread->bytes_transferred += src_size;
1584 		thread->bytes_compressed  += compressed;
1585 	} else {
1586 		session->bytes_transferred += src_size;
1587 		session->bytes_compressed  += compressed;
1588 	}
1589 
1590 	return compressed;
1591 }
1592 
1593 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1594 				    bool overwrite, bool synch)
1595 {
1596 	u64 bytes_written = rec->bytes_written;
1597 	int i;
1598 	int rc = 0;
1599 	int nr_mmaps;
1600 	struct mmap **maps;
1601 	int trace_fd = rec->data.file.fd;
1602 	off_t off = 0;
1603 
1604 	if (!evlist)
1605 		return 0;
1606 
1607 	nr_mmaps = thread->nr_mmaps;
1608 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1609 
1610 	if (!maps)
1611 		return 0;
1612 
1613 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1614 		return 0;
1615 
1616 	if (record__aio_enabled(rec))
1617 		off = record__aio_get_pos(trace_fd);
1618 
1619 	for (i = 0; i < nr_mmaps; i++) {
1620 		u64 flush = 0;
1621 		struct mmap *map = maps[i];
1622 
1623 		if (map->core.base) {
1624 			record__adjust_affinity(rec, map);
1625 			if (synch) {
1626 				flush = map->core.flush;
1627 				map->core.flush = 1;
1628 			}
1629 			if (!record__aio_enabled(rec)) {
1630 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1631 					if (synch)
1632 						map->core.flush = flush;
1633 					rc = -1;
1634 					goto out;
1635 				}
1636 			} else {
1637 				if (record__aio_push(rec, map, &off) < 0) {
1638 					record__aio_set_pos(trace_fd, off);
1639 					if (synch)
1640 						map->core.flush = flush;
1641 					rc = -1;
1642 					goto out;
1643 				}
1644 			}
1645 			if (synch)
1646 				map->core.flush = flush;
1647 		}
1648 
1649 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1650 		    !rec->opts.auxtrace_sample_mode &&
1651 		    record__auxtrace_mmap_read(rec, map) != 0) {
1652 			rc = -1;
1653 			goto out;
1654 		}
1655 	}
1656 
1657 	if (record__aio_enabled(rec))
1658 		record__aio_set_pos(trace_fd, off);
1659 
1660 	/*
1661 	 * Mark the round finished in case we wrote
1662 	 * at least one event.
1663 	 *
1664 	 * No need for round events in directory mode,
1665 	 * because per-cpu maps and files have data
1666 	 * sorted by kernel.
1667 	 */
1668 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1669 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1670 
1671 	if (overwrite)
1672 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1673 out:
1674 	return rc;
1675 }
1676 
1677 static int record__mmap_read_all(struct record *rec, bool synch)
1678 {
1679 	int err;
1680 
1681 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1682 	if (err)
1683 		return err;
1684 
1685 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1686 }
1687 
1688 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1689 					   void *arg __maybe_unused)
1690 {
1691 	struct perf_mmap *map = fda->priv[fd].ptr;
1692 
1693 	if (map)
1694 		perf_mmap__put(map);
1695 }
1696 
1697 static void *record__thread(void *arg)
1698 {
1699 	enum thread_msg msg = THREAD_MSG__READY;
1700 	bool terminate = false;
1701 	struct fdarray *pollfd;
1702 	int err, ctlfd_pos;
1703 
1704 	thread = arg;
1705 	thread->tid = gettid();
1706 
1707 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1708 	if (err == -1)
1709 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1710 			   thread->tid, strerror(errno));
1711 
1712 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1713 
1714 	pollfd = &thread->pollfd;
1715 	ctlfd_pos = thread->ctlfd_pos;
1716 
1717 	for (;;) {
1718 		unsigned long long hits = thread->samples;
1719 
1720 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1721 			break;
1722 
1723 		if (hits == thread->samples) {
1724 
1725 			err = fdarray__poll(pollfd, -1);
1726 			/*
1727 			 * Propagate error, only if there's any. Ignore positive
1728 			 * number of returned events and interrupt error.
1729 			 */
1730 			if (err > 0 || (err < 0 && errno == EINTR))
1731 				err = 0;
1732 			thread->waking++;
1733 
1734 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1735 					    record__thread_munmap_filtered, NULL) == 0)
1736 				break;
1737 		}
1738 
1739 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1740 			terminate = true;
1741 			close(thread->pipes.msg[0]);
1742 			thread->pipes.msg[0] = -1;
1743 			pollfd->entries[ctlfd_pos].fd = -1;
1744 			pollfd->entries[ctlfd_pos].events = 0;
1745 		}
1746 
1747 		pollfd->entries[ctlfd_pos].revents = 0;
1748 	}
1749 	record__mmap_read_all(thread->rec, true);
1750 
1751 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1752 	if (err == -1)
1753 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1754 			   thread->tid, strerror(errno));
1755 
1756 	return NULL;
1757 }
1758 
1759 static void record__init_features(struct record *rec)
1760 {
1761 	struct perf_session *session = rec->session;
1762 	int feat;
1763 
1764 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1765 		perf_header__set_feat(&session->header, feat);
1766 
1767 	if (rec->no_buildid)
1768 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1769 
1770 	if (!have_tracepoints(&rec->evlist->core.entries))
1771 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1772 
1773 	if (!rec->opts.branch_stack)
1774 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1775 
1776 	if (!rec->opts.full_auxtrace)
1777 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1778 
1779 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1780 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1781 
1782 	if (!rec->opts.use_clockid)
1783 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1784 
1785 	if (!record__threads_enabled(rec))
1786 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1787 
1788 	if (!record__comp_enabled(rec))
1789 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1790 
1791 	perf_header__clear_feat(&session->header, HEADER_STAT);
1792 }
1793 
1794 static void
1795 record__finish_output(struct record *rec)
1796 {
1797 	int i;
1798 	struct perf_data *data = &rec->data;
1799 	int fd = perf_data__fd(data);
1800 
1801 	if (data->is_pipe) {
1802 		/* Just to display approx. size */
1803 		data->file.size = rec->bytes_written;
1804 		return;
1805 	}
1806 
1807 	rec->session->header.data_size += rec->bytes_written;
1808 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1809 	if (record__threads_enabled(rec)) {
1810 		for (i = 0; i < data->dir.nr; i++)
1811 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1812 	}
1813 
1814 	if (!rec->no_buildid) {
1815 		process_buildids(rec);
1816 
1817 		if (rec->buildid_all)
1818 			perf_session__dsos_hit_all(rec->session);
1819 	}
1820 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1821 
1822 	return;
1823 }
1824 
1825 static int record__synthesize_workload(struct record *rec, bool tail)
1826 {
1827 	int err;
1828 	struct perf_thread_map *thread_map;
1829 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1830 
1831 	if (rec->opts.tail_synthesize != tail)
1832 		return 0;
1833 
1834 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1835 	if (thread_map == NULL)
1836 		return -1;
1837 
1838 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1839 						 process_synthesized_event,
1840 						 &rec->session->machines.host,
1841 						 needs_mmap,
1842 						 rec->opts.sample_address);
1843 	perf_thread_map__put(thread_map);
1844 	return err;
1845 }
1846 
1847 static int write_finished_init(struct record *rec, bool tail)
1848 {
1849 	if (rec->opts.tail_synthesize != tail)
1850 		return 0;
1851 
1852 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1853 }
1854 
1855 static int record__synthesize(struct record *rec, bool tail);
1856 
1857 static int
1858 record__switch_output(struct record *rec, bool at_exit)
1859 {
1860 	struct perf_data *data = &rec->data;
1861 	char *new_filename = NULL;
1862 	int fd, err;
1863 
1864 	/* Same Size:      "2015122520103046"*/
1865 	char timestamp[] = "InvalidTimestamp";
1866 
1867 	record__aio_mmap_read_sync(rec);
1868 
1869 	write_finished_init(rec, true);
1870 
1871 	record__synthesize(rec, true);
1872 	if (target__none(&rec->opts.target))
1873 		record__synthesize_workload(rec, true);
1874 
1875 	rec->samples = 0;
1876 	record__finish_output(rec);
1877 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1878 	if (err) {
1879 		pr_err("Failed to get current timestamp\n");
1880 		return -EINVAL;
1881 	}
1882 
1883 	fd = perf_data__switch(data, timestamp,
1884 			       rec->session->header.data_offset,
1885 			       at_exit, &new_filename);
1886 	if (fd >= 0 && !at_exit) {
1887 		rec->bytes_written = 0;
1888 		rec->session->header.data_size = 0;
1889 	}
1890 
1891 	if (!quiet) {
1892 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1893 			data->path, timestamp);
1894 	}
1895 
1896 	if (rec->switch_output.num_files) {
1897 		int n = rec->switch_output.cur_file + 1;
1898 
1899 		if (n >= rec->switch_output.num_files)
1900 			n = 0;
1901 		rec->switch_output.cur_file = n;
1902 		if (rec->switch_output.filenames[n]) {
1903 			remove(rec->switch_output.filenames[n]);
1904 			zfree(&rec->switch_output.filenames[n]);
1905 		}
1906 		rec->switch_output.filenames[n] = new_filename;
1907 	} else {
1908 		free(new_filename);
1909 	}
1910 
1911 	/* Output tracking events */
1912 	if (!at_exit) {
1913 		record__synthesize(rec, false);
1914 
1915 		/*
1916 		 * In 'perf record --switch-output' without -a,
1917 		 * record__synthesize() in record__switch_output() won't
1918 		 * generate tracking events because there's no thread_map
1919 		 * in evlist. Which causes newly created perf.data doesn't
1920 		 * contain map and comm information.
1921 		 * Create a fake thread_map and directly call
1922 		 * perf_event__synthesize_thread_map() for those events.
1923 		 */
1924 		if (target__none(&rec->opts.target))
1925 			record__synthesize_workload(rec, false);
1926 		write_finished_init(rec, false);
1927 	}
1928 	return fd;
1929 }
1930 
1931 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1932 					struct perf_record_lost_samples *lost,
1933 					int cpu_idx, int thread_idx, u64 lost_count,
1934 					u16 misc_flag)
1935 {
1936 	struct perf_sample_id *sid;
1937 	struct perf_sample sample;
1938 	int id_hdr_size;
1939 
1940 	perf_sample__init(&sample, /*all=*/true);
1941 	lost->lost = lost_count;
1942 	if (evsel->core.ids) {
1943 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1944 		sample.id = sid->id;
1945 	}
1946 
1947 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1948 						       evsel->core.attr.sample_type, &sample);
1949 	lost->header.size = sizeof(*lost) + id_hdr_size;
1950 	lost->header.misc = misc_flag;
1951 	record__write(rec, NULL, lost, lost->header.size);
1952 	perf_sample__exit(&sample);
1953 }
1954 
1955 static void record__read_lost_samples(struct record *rec)
1956 {
1957 	struct perf_session *session = rec->session;
1958 	struct perf_record_lost_samples_and_ids lost;
1959 	struct evsel *evsel;
1960 
1961 	/* there was an error during record__open */
1962 	if (session->evlist == NULL)
1963 		return;
1964 
1965 	evlist__for_each_entry(session->evlist, evsel) {
1966 		struct xyarray *xy = evsel->core.sample_id;
1967 		u64 lost_count;
1968 
1969 		if (xy == NULL || evsel->core.fd == NULL)
1970 			continue;
1971 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1972 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1973 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1974 			continue;
1975 		}
1976 
1977 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1978 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1979 				struct perf_counts_values count;
1980 
1981 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1982 					pr_debug("read LOST count failed\n");
1983 					return;
1984 				}
1985 
1986 				if (count.lost) {
1987 					memset(&lost, 0, sizeof(lost));
1988 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1989 					__record__save_lost_samples(rec, evsel, &lost.lost,
1990 								    x, y, count.lost, 0);
1991 				}
1992 			}
1993 		}
1994 
1995 		lost_count = perf_bpf_filter__lost_count(evsel);
1996 		if (lost_count) {
1997 			memset(&lost, 0, sizeof(lost));
1998 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1999 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2000 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2001 		}
2002 	}
2003 }
2004 
2005 static volatile sig_atomic_t workload_exec_errno;
2006 
2007 /*
2008  * evlist__prepare_workload will send a SIGUSR1
2009  * if the fork fails, since we asked by setting its
2010  * want_signal to true.
2011  */
2012 static void workload_exec_failed_signal(int signo __maybe_unused,
2013 					siginfo_t *info,
2014 					void *ucontext __maybe_unused)
2015 {
2016 	workload_exec_errno = info->si_value.sival_int;
2017 	done = 1;
2018 	child_finished = 1;
2019 }
2020 
2021 static void snapshot_sig_handler(int sig);
2022 static void alarm_sig_handler(int sig);
2023 
2024 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2025 {
2026 	if (evlist) {
2027 		if (evlist->mmap && evlist->mmap[0].core.base)
2028 			return evlist->mmap[0].core.base;
2029 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2030 			return evlist->overwrite_mmap[0].core.base;
2031 	}
2032 	return NULL;
2033 }
2034 
2035 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2036 {
2037 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2038 	if (pc)
2039 		return pc;
2040 	return NULL;
2041 }
2042 
2043 static int record__synthesize(struct record *rec, bool tail)
2044 {
2045 	struct perf_session *session = rec->session;
2046 	struct machine *machine = &session->machines.host;
2047 	struct perf_data *data = &rec->data;
2048 	struct record_opts *opts = &rec->opts;
2049 	struct perf_tool *tool = &rec->tool;
2050 	int err = 0;
2051 	event_op f = process_synthesized_event;
2052 
2053 	if (rec->opts.tail_synthesize != tail)
2054 		return 0;
2055 
2056 	if (data->is_pipe) {
2057 		err = perf_event__synthesize_for_pipe(tool, session, data,
2058 						      process_synthesized_event);
2059 		if (err < 0)
2060 			goto out;
2061 
2062 		rec->bytes_written += err;
2063 	}
2064 
2065 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2066 					  process_synthesized_event, machine);
2067 	if (err)
2068 		goto out;
2069 
2070 	/* Synthesize id_index before auxtrace_info */
2071 	err = perf_event__synthesize_id_index(tool,
2072 					      process_synthesized_event,
2073 					      session->evlist, machine);
2074 	if (err)
2075 		goto out;
2076 
2077 	if (rec->opts.full_auxtrace) {
2078 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2079 					session, process_synthesized_event);
2080 		if (err)
2081 			goto out;
2082 	}
2083 
2084 	if (!evlist__exclude_kernel(rec->evlist)) {
2085 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2086 							 machine);
2087 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2088 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2089 				   "Check /proc/kallsyms permission or run as root.\n");
2090 
2091 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2092 						     machine);
2093 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2094 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2095 				   "Check /proc/modules permission or run as root.\n");
2096 	}
2097 
2098 	if (perf_guest) {
2099 		machines__process_guests(&session->machines,
2100 					 perf_event__synthesize_guest_os, tool);
2101 	}
2102 
2103 	err = perf_event__synthesize_extra_attr(&rec->tool,
2104 						rec->evlist,
2105 						process_synthesized_event,
2106 						data->is_pipe);
2107 	if (err)
2108 		goto out;
2109 
2110 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2111 						 process_synthesized_event,
2112 						NULL);
2113 	if (err < 0) {
2114 		pr_err("Couldn't synthesize thread map.\n");
2115 		return err;
2116 	}
2117 
2118 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2119 					     process_synthesized_event, NULL);
2120 	if (err < 0) {
2121 		pr_err("Couldn't synthesize cpu map.\n");
2122 		return err;
2123 	}
2124 
2125 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2126 						machine, opts);
2127 	if (err < 0) {
2128 		pr_warning("Couldn't synthesize bpf events.\n");
2129 		err = 0;
2130 	}
2131 
2132 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2133 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2134 						     machine);
2135 		if (err < 0) {
2136 			pr_warning("Couldn't synthesize cgroup events.\n");
2137 			err = 0;
2138 		}
2139 	}
2140 
2141 	if (rec->opts.nr_threads_synthesize > 1) {
2142 		mutex_init(&synth_lock);
2143 		perf_set_multithreaded();
2144 		f = process_locked_synthesized_event;
2145 	}
2146 
2147 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2148 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2149 
2150 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2151 						    rec->evlist->core.threads,
2152 						    f, needs_mmap, opts->sample_address,
2153 						    rec->opts.nr_threads_synthesize);
2154 	}
2155 
2156 	if (rec->opts.nr_threads_synthesize > 1) {
2157 		perf_set_singlethreaded();
2158 		mutex_destroy(&synth_lock);
2159 	}
2160 
2161 out:
2162 	return err;
2163 }
2164 
2165 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2166 {
2167 #ifdef HAVE_LIBBPF_SUPPORT
2168 	perf_event__synthesize_final_bpf_metadata(rec->session,
2169 						  process_synthesized_event);
2170 #endif
2171 }
2172 
2173 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2174 {
2175 	struct record *rec = data;
2176 	pthread_kill(rec->thread_id, SIGUSR2);
2177 	return 0;
2178 }
2179 
2180 static int record__setup_sb_evlist(struct record *rec)
2181 {
2182 	struct record_opts *opts = &rec->opts;
2183 
2184 	if (rec->sb_evlist != NULL) {
2185 		/*
2186 		 * We get here if --switch-output-event populated the
2187 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2188 		 * to the main thread.
2189 		 */
2190 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2191 		rec->thread_id = pthread_self();
2192 	}
2193 #ifdef HAVE_LIBBPF_SUPPORT
2194 	if (!opts->no_bpf_event) {
2195 		if (rec->sb_evlist == NULL) {
2196 			rec->sb_evlist = evlist__new();
2197 
2198 			if (rec->sb_evlist == NULL) {
2199 				pr_err("Couldn't create side band evlist.\n.");
2200 				return -1;
2201 			}
2202 		}
2203 
2204 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2205 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2206 			return -1;
2207 		}
2208 	}
2209 #endif
2210 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2211 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2212 		opts->no_bpf_event = true;
2213 	}
2214 
2215 	return 0;
2216 }
2217 
2218 static int record__init_clock(struct record *rec)
2219 {
2220 	struct perf_session *session = rec->session;
2221 	struct timespec ref_clockid;
2222 	struct timeval ref_tod;
2223 	u64 ref;
2224 
2225 	if (!rec->opts.use_clockid)
2226 		return 0;
2227 
2228 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2229 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2230 
2231 	session->header.env.clock.clockid = rec->opts.clockid;
2232 
2233 	if (gettimeofday(&ref_tod, NULL) != 0) {
2234 		pr_err("gettimeofday failed, cannot set reference time.\n");
2235 		return -1;
2236 	}
2237 
2238 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2239 		pr_err("clock_gettime failed, cannot set reference time.\n");
2240 		return -1;
2241 	}
2242 
2243 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2244 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2245 
2246 	session->header.env.clock.tod_ns = ref;
2247 
2248 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2249 	      (u64) ref_clockid.tv_nsec;
2250 
2251 	session->header.env.clock.clockid_ns = ref;
2252 	return 0;
2253 }
2254 
2255 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2256 {
2257 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2258 		trigger_hit(&auxtrace_snapshot_trigger);
2259 		auxtrace_record__snapshot_started = 1;
2260 		if (auxtrace_record__snapshot_start(rec->itr))
2261 			trigger_error(&auxtrace_snapshot_trigger);
2262 	}
2263 }
2264 
2265 static int record__terminate_thread(struct record_thread *thread_data)
2266 {
2267 	int err;
2268 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2269 	pid_t tid = thread_data->tid;
2270 
2271 	close(thread_data->pipes.msg[1]);
2272 	thread_data->pipes.msg[1] = -1;
2273 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2274 	if (err > 0)
2275 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2276 	else
2277 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2278 			   thread->tid, tid);
2279 
2280 	return 0;
2281 }
2282 
2283 static int record__start_threads(struct record *rec)
2284 {
2285 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2286 	struct record_thread *thread_data = rec->thread_data;
2287 	sigset_t full, mask;
2288 	pthread_t handle;
2289 	pthread_attr_t attrs;
2290 
2291 	thread = &thread_data[0];
2292 
2293 	if (!record__threads_enabled(rec))
2294 		return 0;
2295 
2296 	sigfillset(&full);
2297 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2298 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2299 		return -1;
2300 	}
2301 
2302 	pthread_attr_init(&attrs);
2303 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2304 
2305 	for (t = 1; t < nr_threads; t++) {
2306 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2307 
2308 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2309 		pthread_attr_setaffinity_np(&attrs,
2310 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2311 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2312 #endif
2313 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2314 			for (tt = 1; tt < t; tt++)
2315 				record__terminate_thread(&thread_data[t]);
2316 			pr_err("Failed to start threads: %s\n", strerror(errno));
2317 			ret = -1;
2318 			goto out_err;
2319 		}
2320 
2321 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2322 		if (err > 0)
2323 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2324 				  thread_msg_tags[msg]);
2325 		else
2326 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2327 				   thread->tid, rec->thread_data[t].tid);
2328 	}
2329 
2330 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2331 			(cpu_set_t *)thread->mask->affinity.bits);
2332 
2333 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2334 
2335 out_err:
2336 	pthread_attr_destroy(&attrs);
2337 
2338 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2339 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2340 		ret = -1;
2341 	}
2342 
2343 	return ret;
2344 }
2345 
2346 static int record__stop_threads(struct record *rec)
2347 {
2348 	int t;
2349 	struct record_thread *thread_data = rec->thread_data;
2350 
2351 	for (t = 1; t < rec->nr_threads; t++)
2352 		record__terminate_thread(&thread_data[t]);
2353 
2354 	for (t = 0; t < rec->nr_threads; t++) {
2355 		rec->samples += thread_data[t].samples;
2356 		if (!record__threads_enabled(rec))
2357 			continue;
2358 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2359 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2360 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2361 			 thread_data[t].samples, thread_data[t].waking);
2362 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2363 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2364 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2365 		else
2366 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2367 	}
2368 
2369 	return 0;
2370 }
2371 
2372 static unsigned long record__waking(struct record *rec)
2373 {
2374 	int t;
2375 	unsigned long waking = 0;
2376 	struct record_thread *thread_data = rec->thread_data;
2377 
2378 	for (t = 0; t < rec->nr_threads; t++)
2379 		waking += thread_data[t].waking;
2380 
2381 	return waking;
2382 }
2383 
2384 static int __cmd_record(struct record *rec, int argc, const char **argv)
2385 {
2386 	int err;
2387 	int status = 0;
2388 	const bool forks = argc > 0;
2389 	struct perf_tool *tool = &rec->tool;
2390 	struct record_opts *opts = &rec->opts;
2391 	struct perf_data *data = &rec->data;
2392 	struct perf_session *session;
2393 	bool disabled = false, draining = false;
2394 	int fd;
2395 	float ratio = 0;
2396 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2397 
2398 	atexit(record__sig_exit);
2399 	signal(SIGCHLD, sig_handler);
2400 	signal(SIGINT, sig_handler);
2401 	signal(SIGTERM, sig_handler);
2402 	signal(SIGSEGV, sigsegv_handler);
2403 
2404 	if (rec->opts.record_cgroup) {
2405 #ifndef HAVE_FILE_HANDLE
2406 		pr_err("cgroup tracking is not supported\n");
2407 		return -1;
2408 #endif
2409 	}
2410 
2411 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2412 		signal(SIGUSR2, snapshot_sig_handler);
2413 		if (rec->opts.auxtrace_snapshot_mode)
2414 			trigger_on(&auxtrace_snapshot_trigger);
2415 		if (rec->switch_output.enabled)
2416 			trigger_on(&switch_output_trigger);
2417 	} else {
2418 		signal(SIGUSR2, SIG_IGN);
2419 	}
2420 
2421 	perf_tool__init(tool, /*ordered_events=*/true);
2422 	tool->sample		= process_sample_event;
2423 	tool->fork		= perf_event__process_fork;
2424 	tool->exit		= perf_event__process_exit;
2425 	tool->comm		= perf_event__process_comm;
2426 	tool->namespaces	= perf_event__process_namespaces;
2427 	tool->mmap		= build_id__process_mmap;
2428 	tool->mmap2		= build_id__process_mmap2;
2429 	tool->itrace_start	= process_timestamp_boundary;
2430 	tool->aux		= process_timestamp_boundary;
2431 	tool->namespace_events	= rec->opts.record_namespaces;
2432 	tool->cgroup_events	= rec->opts.record_cgroup;
2433 	session = perf_session__new(data, tool);
2434 	if (IS_ERR(session)) {
2435 		pr_err("Perf session creation failed.\n");
2436 		return PTR_ERR(session);
2437 	}
2438 
2439 	if (record__threads_enabled(rec)) {
2440 		if (perf_data__is_pipe(&rec->data)) {
2441 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2442 			return -1;
2443 		}
2444 		if (rec->opts.full_auxtrace) {
2445 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2446 			return -1;
2447 		}
2448 	}
2449 
2450 	fd = perf_data__fd(data);
2451 	rec->session = session;
2452 
2453 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2454 		pr_err("Compression initialization failed.\n");
2455 		return -1;
2456 	}
2457 #ifdef HAVE_EVENTFD_SUPPORT
2458 	done_fd = eventfd(0, EFD_NONBLOCK);
2459 	if (done_fd < 0) {
2460 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2461 		status = -1;
2462 		goto out_delete_session;
2463 	}
2464 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2465 	if (err < 0) {
2466 		pr_err("Failed to add wakeup eventfd to poll list\n");
2467 		status = err;
2468 		goto out_delete_session;
2469 	}
2470 #endif // HAVE_EVENTFD_SUPPORT
2471 
2472 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2473 	session->header.env.comp_level = rec->opts.comp_level;
2474 
2475 	if (rec->opts.kcore &&
2476 	    !record__kcore_readable(&session->machines.host)) {
2477 		pr_err("ERROR: kcore is not readable.\n");
2478 		return -1;
2479 	}
2480 
2481 	if (record__init_clock(rec))
2482 		return -1;
2483 
2484 	record__init_features(rec);
2485 
2486 	if (forks) {
2487 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2488 					       workload_exec_failed_signal);
2489 		if (err < 0) {
2490 			pr_err("Couldn't run the workload!\n");
2491 			status = err;
2492 			goto out_delete_session;
2493 		}
2494 	}
2495 
2496 	/*
2497 	 * If we have just single event and are sending data
2498 	 * through pipe, we need to force the ids allocation,
2499 	 * because we synthesize event name through the pipe
2500 	 * and need the id for that.
2501 	 */
2502 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2503 		rec->opts.sample_id = true;
2504 
2505 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2506 		rec->timestamp_filename = false;
2507 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2508 	}
2509 
2510 	/*
2511 	 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2512 	 * and hybrid_merge is false.
2513 	 */
2514 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2515 
2516 	evlist__config(rec->evlist, opts, &callchain_param);
2517 
2518 	/* Debug message used by test scripts */
2519 	pr_debug3("perf record opening and mmapping events\n");
2520 	if (record__open(rec) != 0) {
2521 		err = -1;
2522 		goto out_free_threads;
2523 	}
2524 	/* Debug message used by test scripts */
2525 	pr_debug3("perf record done opening and mmapping events\n");
2526 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2527 
2528 	if (rec->opts.kcore) {
2529 		err = record__kcore_copy(&session->machines.host, data);
2530 		if (err) {
2531 			pr_err("ERROR: Failed to copy kcore\n");
2532 			goto out_free_threads;
2533 		}
2534 	}
2535 
2536 	/*
2537 	 * Normally perf_session__new would do this, but it doesn't have the
2538 	 * evlist.
2539 	 */
2540 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2541 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2542 		rec->tool.ordered_events = false;
2543 	}
2544 
2545 	if (evlist__nr_groups(rec->evlist) == 0)
2546 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2547 
2548 	if (data->is_pipe) {
2549 		err = perf_header__write_pipe(fd);
2550 		if (err < 0)
2551 			goto out_free_threads;
2552 	} else {
2553 		err = perf_session__write_header(session, rec->evlist, fd, false);
2554 		if (err < 0)
2555 			goto out_free_threads;
2556 	}
2557 
2558 	err = -1;
2559 	if (!rec->no_buildid
2560 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2561 		pr_err("Couldn't generate buildids. "
2562 		       "Use --no-buildid to profile anyway.\n");
2563 		goto out_free_threads;
2564 	}
2565 
2566 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2567 		opts->no_bpf_event = true;
2568 
2569 	err = record__setup_sb_evlist(rec);
2570 	if (err)
2571 		goto out_free_threads;
2572 
2573 	err = record__synthesize(rec, false);
2574 	if (err < 0)
2575 		goto out_free_threads;
2576 
2577 	if (rec->realtime_prio) {
2578 		struct sched_param param;
2579 
2580 		param.sched_priority = rec->realtime_prio;
2581 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2582 			pr_err("Could not set realtime priority.\n");
2583 			err = -1;
2584 			goto out_free_threads;
2585 		}
2586 	}
2587 
2588 	if (record__start_threads(rec))
2589 		goto out_free_threads;
2590 
2591 	/*
2592 	 * When perf is starting the traced process, all the events
2593 	 * (apart from group members) have enable_on_exec=1 set,
2594 	 * so don't spoil it by prematurely enabling them.
2595 	 */
2596 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2597 		evlist__enable(rec->evlist);
2598 
2599 	/*
2600 	 * offcpu-time does not call execve, so enable_on_exe wouldn't work
2601 	 * when recording a workload, do it manually
2602 	 */
2603 	if (rec->off_cpu)
2604 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2605 
2606 	/*
2607 	 * Let the child rip
2608 	 */
2609 	if (forks) {
2610 		struct machine *machine = &session->machines.host;
2611 		union perf_event *event;
2612 		pid_t tgid;
2613 
2614 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2615 		if (event == NULL) {
2616 			err = -ENOMEM;
2617 			goto out_child;
2618 		}
2619 
2620 		/*
2621 		 * Some H/W events are generated before COMM event
2622 		 * which is emitted during exec(), so perf script
2623 		 * cannot see a correct process name for those events.
2624 		 * Synthesize COMM event to prevent it.
2625 		 */
2626 		tgid = perf_event__synthesize_comm(tool, event,
2627 						   rec->evlist->workload.pid,
2628 						   process_synthesized_event,
2629 						   machine);
2630 		free(event);
2631 
2632 		if (tgid == -1)
2633 			goto out_child;
2634 
2635 		event = malloc(sizeof(event->namespaces) +
2636 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2637 			       machine->id_hdr_size);
2638 		if (event == NULL) {
2639 			err = -ENOMEM;
2640 			goto out_child;
2641 		}
2642 
2643 		/*
2644 		 * Synthesize NAMESPACES event for the command specified.
2645 		 */
2646 		perf_event__synthesize_namespaces(tool, event,
2647 						  rec->evlist->workload.pid,
2648 						  tgid, process_synthesized_event,
2649 						  machine);
2650 		free(event);
2651 
2652 		evlist__start_workload(rec->evlist);
2653 	}
2654 
2655 	if (opts->target.initial_delay) {
2656 		pr_info(EVLIST_DISABLED_MSG);
2657 		if (opts->target.initial_delay > 0) {
2658 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2659 			evlist__enable(rec->evlist);
2660 			pr_info(EVLIST_ENABLED_MSG);
2661 		}
2662 	}
2663 
2664 	err = event_enable_timer__start(rec->evlist->eet);
2665 	if (err)
2666 		goto out_child;
2667 
2668 	/* Debug message used by test scripts */
2669 	pr_debug3("perf record has started\n");
2670 	fflush(stderr);
2671 
2672 	trigger_ready(&auxtrace_snapshot_trigger);
2673 	trigger_ready(&switch_output_trigger);
2674 	perf_hooks__invoke_record_start();
2675 
2676 	/*
2677 	 * Must write FINISHED_INIT so it will be seen after all other
2678 	 * synthesized user events, but before any regular events.
2679 	 */
2680 	err = write_finished_init(rec, false);
2681 	if (err < 0)
2682 		goto out_child;
2683 
2684 	for (;;) {
2685 		unsigned long long hits = thread->samples;
2686 
2687 		/*
2688 		 * rec->evlist->bkw_mmap_state is possible to be
2689 		 * BKW_MMAP_EMPTY here: when done == true and
2690 		 * hits != rec->samples in previous round.
2691 		 *
2692 		 * evlist__toggle_bkw_mmap ensure we never
2693 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2694 		 */
2695 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2696 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2697 
2698 		if (record__mmap_read_all(rec, false) < 0) {
2699 			trigger_error(&auxtrace_snapshot_trigger);
2700 			trigger_error(&switch_output_trigger);
2701 			err = -1;
2702 			goto out_child;
2703 		}
2704 
2705 		if (auxtrace_record__snapshot_started) {
2706 			auxtrace_record__snapshot_started = 0;
2707 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2708 				record__read_auxtrace_snapshot(rec, false);
2709 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2710 				pr_err("AUX area tracing snapshot failed\n");
2711 				err = -1;
2712 				goto out_child;
2713 			}
2714 		}
2715 
2716 		if (trigger_is_hit(&switch_output_trigger)) {
2717 			/*
2718 			 * If switch_output_trigger is hit, the data in
2719 			 * overwritable ring buffer should have been collected,
2720 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2721 			 *
2722 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2723 			 * record__mmap_read_all() didn't collect data from
2724 			 * overwritable ring buffer. Read again.
2725 			 */
2726 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2727 				continue;
2728 			trigger_ready(&switch_output_trigger);
2729 
2730 			/*
2731 			 * Reenable events in overwrite ring buffer after
2732 			 * record__mmap_read_all(): we should have collected
2733 			 * data from it.
2734 			 */
2735 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2736 
2737 			if (!quiet)
2738 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2739 					record__waking(rec));
2740 			thread->waking = 0;
2741 			fd = record__switch_output(rec, false);
2742 			if (fd < 0) {
2743 				pr_err("Failed to switch to new file\n");
2744 				trigger_error(&switch_output_trigger);
2745 				err = fd;
2746 				goto out_child;
2747 			}
2748 
2749 			/* re-arm the alarm */
2750 			if (rec->switch_output.time)
2751 				alarm(rec->switch_output.time);
2752 		}
2753 
2754 		if (hits == thread->samples) {
2755 			if (done || draining)
2756 				break;
2757 			err = fdarray__poll(&thread->pollfd, -1);
2758 			/*
2759 			 * Propagate error, only if there's any. Ignore positive
2760 			 * number of returned events and interrupt error.
2761 			 */
2762 			if (err > 0 || (err < 0 && errno == EINTR))
2763 				err = 0;
2764 			thread->waking++;
2765 
2766 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2767 					    record__thread_munmap_filtered, NULL) == 0)
2768 				draining = true;
2769 
2770 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2771 			if (err)
2772 				goto out_child;
2773 		}
2774 
2775 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2776 			switch (cmd) {
2777 			case EVLIST_CTL_CMD_SNAPSHOT:
2778 				hit_auxtrace_snapshot_trigger(rec);
2779 				evlist__ctlfd_ack(rec->evlist);
2780 				break;
2781 			case EVLIST_CTL_CMD_STOP:
2782 				done = 1;
2783 				break;
2784 			case EVLIST_CTL_CMD_ACK:
2785 			case EVLIST_CTL_CMD_UNSUPPORTED:
2786 			case EVLIST_CTL_CMD_ENABLE:
2787 			case EVLIST_CTL_CMD_DISABLE:
2788 			case EVLIST_CTL_CMD_EVLIST:
2789 			case EVLIST_CTL_CMD_PING:
2790 			default:
2791 				break;
2792 			}
2793 		}
2794 
2795 		err = event_enable_timer__process(rec->evlist->eet);
2796 		if (err < 0)
2797 			goto out_child;
2798 		if (err) {
2799 			err = 0;
2800 			done = 1;
2801 		}
2802 
2803 		/*
2804 		 * When perf is starting the traced process, at the end events
2805 		 * die with the process and we wait for that. Thus no need to
2806 		 * disable events in this case.
2807 		 */
2808 		if (done && !disabled && !target__none(&opts->target)) {
2809 			trigger_off(&auxtrace_snapshot_trigger);
2810 			evlist__disable(rec->evlist);
2811 			disabled = true;
2812 		}
2813 	}
2814 
2815 	trigger_off(&auxtrace_snapshot_trigger);
2816 	trigger_off(&switch_output_trigger);
2817 
2818 	record__synthesize_final_bpf_metadata(rec);
2819 
2820 	if (opts->auxtrace_snapshot_on_exit)
2821 		record__auxtrace_snapshot_exit(rec);
2822 
2823 	if (forks && workload_exec_errno) {
2824 		char msg[STRERR_BUFSIZE];
2825 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2826 		struct strbuf sb = STRBUF_INIT;
2827 
2828 		evlist__format_evsels(rec->evlist, &sb, 2048);
2829 
2830 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2831 			sb.buf, argv[0], emsg);
2832 		strbuf_release(&sb);
2833 		err = -1;
2834 		goto out_child;
2835 	}
2836 
2837 	if (!quiet)
2838 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2839 			record__waking(rec));
2840 
2841 	write_finished_init(rec, true);
2842 
2843 	if (target__none(&rec->opts.target))
2844 		record__synthesize_workload(rec, true);
2845 
2846 out_child:
2847 	record__stop_threads(rec);
2848 	record__mmap_read_all(rec, true);
2849 out_free_threads:
2850 	record__free_thread_data(rec);
2851 	evlist__finalize_ctlfd(rec->evlist);
2852 	record__aio_mmap_read_sync(rec);
2853 
2854 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2855 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2856 		session->header.env.comp_ratio = ratio + 0.5;
2857 	}
2858 
2859 	if (forks) {
2860 		int exit_status;
2861 
2862 		if (!child_finished)
2863 			kill(rec->evlist->workload.pid, SIGTERM);
2864 
2865 		wait(&exit_status);
2866 
2867 		if (err < 0)
2868 			status = err;
2869 		else if (WIFEXITED(exit_status))
2870 			status = WEXITSTATUS(exit_status);
2871 		else if (WIFSIGNALED(exit_status))
2872 			signr = WTERMSIG(exit_status);
2873 	} else
2874 		status = err;
2875 
2876 	if (rec->off_cpu)
2877 		rec->bytes_written += off_cpu_write(rec->session);
2878 
2879 	record__read_lost_samples(rec);
2880 	record__synthesize(rec, true);
2881 	/* this will be recalculated during process_buildids() */
2882 	rec->samples = 0;
2883 
2884 	if (!err) {
2885 		if (!rec->timestamp_filename) {
2886 			record__finish_output(rec);
2887 		} else {
2888 			fd = record__switch_output(rec, true);
2889 			if (fd < 0) {
2890 				status = fd;
2891 				goto out_delete_session;
2892 			}
2893 		}
2894 	}
2895 
2896 	perf_hooks__invoke_record_end();
2897 
2898 	if (!err && !quiet) {
2899 		char samples[128];
2900 		const char *postfix = rec->timestamp_filename ?
2901 					".<timestamp>" : "";
2902 
2903 		if (rec->samples && !rec->opts.full_auxtrace)
2904 			scnprintf(samples, sizeof(samples),
2905 				  " (%" PRIu64 " samples)", rec->samples);
2906 		else
2907 			samples[0] = '\0';
2908 
2909 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2910 			perf_data__size(data) / 1024.0 / 1024.0,
2911 			data->path, postfix, samples);
2912 		if (ratio) {
2913 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2914 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2915 					ratio);
2916 		}
2917 		fprintf(stderr, " ]\n");
2918 	}
2919 
2920 out_delete_session:
2921 #ifdef HAVE_EVENTFD_SUPPORT
2922 	if (done_fd >= 0) {
2923 		fd = done_fd;
2924 		done_fd = -1;
2925 
2926 		close(fd);
2927 	}
2928 #endif
2929 	zstd_fini(&session->zstd_data);
2930 	if (!opts->no_bpf_event)
2931 		evlist__stop_sb_thread(rec->sb_evlist);
2932 
2933 	perf_session__delete(session);
2934 	return status;
2935 }
2936 
2937 static void callchain_debug(struct callchain_param *callchain)
2938 {
2939 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2940 
2941 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2942 
2943 	if (callchain->record_mode == CALLCHAIN_DWARF)
2944 		pr_debug("callchain: stack dump size %d\n",
2945 			 callchain->dump_size);
2946 }
2947 
2948 int record_opts__parse_callchain(struct record_opts *record,
2949 				 struct callchain_param *callchain,
2950 				 const char *arg, bool unset)
2951 {
2952 	int ret;
2953 	callchain->enabled = !unset;
2954 
2955 	/* --no-call-graph */
2956 	if (unset) {
2957 		callchain->record_mode = CALLCHAIN_NONE;
2958 		pr_debug("callchain: disabled\n");
2959 		return 0;
2960 	}
2961 
2962 	ret = parse_callchain_record_opt(arg, callchain);
2963 	if (!ret) {
2964 		/* Enable data address sampling for DWARF unwind. */
2965 		if (callchain->record_mode == CALLCHAIN_DWARF)
2966 			record->sample_address = true;
2967 		callchain_debug(callchain);
2968 	}
2969 
2970 	return ret;
2971 }
2972 
2973 int record_parse_callchain_opt(const struct option *opt,
2974 			       const char *arg,
2975 			       int unset)
2976 {
2977 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2978 }
2979 
2980 int record_callchain_opt(const struct option *opt,
2981 			 const char *arg __maybe_unused,
2982 			 int unset __maybe_unused)
2983 {
2984 	struct callchain_param *callchain = opt->value;
2985 
2986 	callchain->enabled = true;
2987 
2988 	if (callchain->record_mode == CALLCHAIN_NONE)
2989 		callchain->record_mode = CALLCHAIN_FP;
2990 
2991 	callchain_debug(callchain);
2992 	return 0;
2993 }
2994 
2995 static int perf_record_config(const char *var, const char *value, void *cb)
2996 {
2997 	struct record *rec = cb;
2998 
2999 	if (!strcmp(var, "record.build-id")) {
3000 		if (!strcmp(value, "cache"))
3001 			rec->no_buildid_cache = false;
3002 		else if (!strcmp(value, "no-cache"))
3003 			rec->no_buildid_cache = true;
3004 		else if (!strcmp(value, "skip"))
3005 			rec->no_buildid = true;
3006 		else if (!strcmp(value, "mmap"))
3007 			rec->buildid_mmap = true;
3008 		else
3009 			return -1;
3010 		return 0;
3011 	}
3012 	if (!strcmp(var, "record.call-graph")) {
3013 		var = "call-graph.record-mode";
3014 		return perf_default_config(var, value, cb);
3015 	}
3016 #ifdef HAVE_AIO_SUPPORT
3017 	if (!strcmp(var, "record.aio")) {
3018 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3019 		if (!rec->opts.nr_cblocks)
3020 			rec->opts.nr_cblocks = nr_cblocks_default;
3021 	}
3022 #endif
3023 	if (!strcmp(var, "record.debuginfod")) {
3024 		rec->debuginfod.urls = strdup(value);
3025 		if (!rec->debuginfod.urls)
3026 			return -ENOMEM;
3027 		rec->debuginfod.set = true;
3028 	}
3029 
3030 	return 0;
3031 }
3032 
3033 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3034 {
3035 	struct record *rec = (struct record *)opt->value;
3036 
3037 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3038 }
3039 
3040 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3041 {
3042 	struct record_opts *opts = (struct record_opts *)opt->value;
3043 
3044 	if (unset || !str)
3045 		return 0;
3046 
3047 	if (!strcasecmp(str, "node"))
3048 		opts->affinity = PERF_AFFINITY_NODE;
3049 	else if (!strcasecmp(str, "cpu"))
3050 		opts->affinity = PERF_AFFINITY_CPU;
3051 
3052 	return 0;
3053 }
3054 
3055 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3056 {
3057 	mask->nbits = nr_bits;
3058 	mask->bits = bitmap_zalloc(mask->nbits);
3059 	if (!mask->bits)
3060 		return -ENOMEM;
3061 
3062 	return 0;
3063 }
3064 
3065 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3066 {
3067 	bitmap_free(mask->bits);
3068 	mask->nbits = 0;
3069 }
3070 
3071 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3072 {
3073 	int ret;
3074 
3075 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3076 	if (ret) {
3077 		mask->affinity.bits = NULL;
3078 		return ret;
3079 	}
3080 
3081 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3082 	if (ret) {
3083 		record__mmap_cpu_mask_free(&mask->maps);
3084 		mask->maps.bits = NULL;
3085 	}
3086 
3087 	return ret;
3088 }
3089 
3090 static void record__thread_mask_free(struct thread_mask *mask)
3091 {
3092 	record__mmap_cpu_mask_free(&mask->maps);
3093 	record__mmap_cpu_mask_free(&mask->affinity);
3094 }
3095 
3096 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3097 {
3098 	int s;
3099 	struct record_opts *opts = opt->value;
3100 
3101 	if (unset || !str || !strlen(str)) {
3102 		opts->threads_spec = THREAD_SPEC__CPU;
3103 	} else {
3104 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3105 			if (s == THREAD_SPEC__USER) {
3106 				opts->threads_user_spec = strdup(str);
3107 				if (!opts->threads_user_spec)
3108 					return -ENOMEM;
3109 				opts->threads_spec = THREAD_SPEC__USER;
3110 				break;
3111 			}
3112 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3113 				opts->threads_spec = s;
3114 				break;
3115 			}
3116 		}
3117 	}
3118 
3119 	if (opts->threads_spec == THREAD_SPEC__USER)
3120 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3121 	else
3122 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3123 
3124 	return 0;
3125 }
3126 
3127 static int parse_output_max_size(const struct option *opt,
3128 				 const char *str, int unset)
3129 {
3130 	unsigned long *s = (unsigned long *)opt->value;
3131 	static struct parse_tag tags_size[] = {
3132 		{ .tag  = 'B', .mult = 1       },
3133 		{ .tag  = 'K', .mult = 1 << 10 },
3134 		{ .tag  = 'M', .mult = 1 << 20 },
3135 		{ .tag  = 'G', .mult = 1 << 30 },
3136 		{ .tag  = 0 },
3137 	};
3138 	unsigned long val;
3139 
3140 	if (unset) {
3141 		*s = 0;
3142 		return 0;
3143 	}
3144 
3145 	val = parse_tag_value(str, tags_size);
3146 	if (val != (unsigned long) -1) {
3147 		*s = val;
3148 		return 0;
3149 	}
3150 
3151 	return -1;
3152 }
3153 
3154 static int record__parse_mmap_pages(const struct option *opt,
3155 				    const char *str,
3156 				    int unset __maybe_unused)
3157 {
3158 	struct record_opts *opts = opt->value;
3159 	char *s, *p;
3160 	unsigned int mmap_pages;
3161 	int ret;
3162 
3163 	if (!str)
3164 		return -EINVAL;
3165 
3166 	s = strdup(str);
3167 	if (!s)
3168 		return -ENOMEM;
3169 
3170 	p = strchr(s, ',');
3171 	if (p)
3172 		*p = '\0';
3173 
3174 	if (*s) {
3175 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3176 		if (ret)
3177 			goto out_free;
3178 		opts->mmap_pages = mmap_pages;
3179 	}
3180 
3181 	if (!p) {
3182 		ret = 0;
3183 		goto out_free;
3184 	}
3185 
3186 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3187 	if (ret)
3188 		goto out_free;
3189 
3190 	opts->auxtrace_mmap_pages = mmap_pages;
3191 
3192 out_free:
3193 	free(s);
3194 	return ret;
3195 }
3196 
3197 static int record__parse_off_cpu_thresh(const struct option *opt,
3198 					const char *str,
3199 					int unset __maybe_unused)
3200 {
3201 	struct record_opts *opts = opt->value;
3202 	char *endptr;
3203 	u64 off_cpu_thresh_ms;
3204 
3205 	if (!str)
3206 		return -EINVAL;
3207 
3208 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3209 
3210 	/* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */
3211 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3212 		return -EINVAL;
3213 	else
3214 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3215 
3216 	return 0;
3217 }
3218 
3219 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3220 {
3221 }
3222 
3223 static int parse_control_option(const struct option *opt,
3224 				const char *str,
3225 				int unset __maybe_unused)
3226 {
3227 	struct record_opts *opts = opt->value;
3228 
3229 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3230 }
3231 
3232 static void switch_output_size_warn(struct record *rec)
3233 {
3234 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3235 	struct switch_output *s = &rec->switch_output;
3236 
3237 	wakeup_size /= 2;
3238 
3239 	if (s->size < wakeup_size) {
3240 		char buf[100];
3241 
3242 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3243 		pr_warning("WARNING: switch-output data size lower than "
3244 			   "wakeup kernel buffer size (%s) "
3245 			   "expect bigger perf.data sizes\n", buf);
3246 	}
3247 }
3248 
3249 static int switch_output_setup(struct record *rec)
3250 {
3251 	struct switch_output *s = &rec->switch_output;
3252 	static struct parse_tag tags_size[] = {
3253 		{ .tag  = 'B', .mult = 1       },
3254 		{ .tag  = 'K', .mult = 1 << 10 },
3255 		{ .tag  = 'M', .mult = 1 << 20 },
3256 		{ .tag  = 'G', .mult = 1 << 30 },
3257 		{ .tag  = 0 },
3258 	};
3259 	static struct parse_tag tags_time[] = {
3260 		{ .tag  = 's', .mult = 1        },
3261 		{ .tag  = 'm', .mult = 60       },
3262 		{ .tag  = 'h', .mult = 60*60    },
3263 		{ .tag  = 'd', .mult = 60*60*24 },
3264 		{ .tag  = 0 },
3265 	};
3266 	unsigned long val;
3267 
3268 	/*
3269 	 * If we're using --switch-output-events, then we imply its
3270 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3271 	 *  thread to its parent.
3272 	 */
3273 	if (rec->switch_output_event_set) {
3274 		if (record__threads_enabled(rec)) {
3275 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3276 			return 0;
3277 		}
3278 		goto do_signal;
3279 	}
3280 
3281 	if (!s->set)
3282 		return 0;
3283 
3284 	if (record__threads_enabled(rec)) {
3285 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3286 		return 0;
3287 	}
3288 
3289 	if (!strcmp(s->str, "signal")) {
3290 do_signal:
3291 		s->signal = true;
3292 		pr_debug("switch-output with SIGUSR2 signal\n");
3293 		goto enabled;
3294 	}
3295 
3296 	val = parse_tag_value(s->str, tags_size);
3297 	if (val != (unsigned long) -1) {
3298 		s->size = val;
3299 		pr_debug("switch-output with %s size threshold\n", s->str);
3300 		goto enabled;
3301 	}
3302 
3303 	val = parse_tag_value(s->str, tags_time);
3304 	if (val != (unsigned long) -1) {
3305 		s->time = val;
3306 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3307 			 s->str, s->time);
3308 		goto enabled;
3309 	}
3310 
3311 	return -1;
3312 
3313 enabled:
3314 	rec->timestamp_filename = true;
3315 	s->enabled              = true;
3316 
3317 	if (s->size && !rec->opts.no_buffering)
3318 		switch_output_size_warn(rec);
3319 
3320 	return 0;
3321 }
3322 
3323 static const char * const __record_usage[] = {
3324 	"perf record [<options>] [<command>]",
3325 	"perf record [<options>] -- <command> [<options>]",
3326 	NULL
3327 };
3328 const char * const *record_usage = __record_usage;
3329 
3330 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3331 				  struct perf_sample *sample, struct machine *machine)
3332 {
3333 	/*
3334 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3335 	 * no need to add them twice.
3336 	 */
3337 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3338 		return 0;
3339 	return perf_event__process_mmap(tool, event, sample, machine);
3340 }
3341 
3342 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3343 				   struct perf_sample *sample, struct machine *machine)
3344 {
3345 	/*
3346 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3347 	 * no need to add them twice.
3348 	 */
3349 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3350 		return 0;
3351 
3352 	return perf_event__process_mmap2(tool, event, sample, machine);
3353 }
3354 
3355 static int process_timestamp_boundary(const struct perf_tool *tool,
3356 				      union perf_event *event __maybe_unused,
3357 				      struct perf_sample *sample,
3358 				      struct machine *machine __maybe_unused)
3359 {
3360 	struct record *rec = container_of(tool, struct record, tool);
3361 
3362 	set_timestamp_boundary(rec, sample->time);
3363 	return 0;
3364 }
3365 
3366 static int parse_record_synth_option(const struct option *opt,
3367 				     const char *str,
3368 				     int unset __maybe_unused)
3369 {
3370 	struct record_opts *opts = opt->value;
3371 	char *p = strdup(str);
3372 
3373 	if (p == NULL)
3374 		return -1;
3375 
3376 	opts->synth = parse_synth_opt(p);
3377 	free(p);
3378 
3379 	if (opts->synth < 0) {
3380 		pr_err("Invalid synth option: %s\n", str);
3381 		return -1;
3382 	}
3383 	return 0;
3384 }
3385 
3386 /*
3387  * XXX Ideally would be local to cmd_record() and passed to a record__new
3388  * because we need to have access to it in record__exit, that is called
3389  * after cmd_record() exits, but since record_options need to be accessible to
3390  * builtin-script, leave it here.
3391  *
3392  * At least we don't ouch it in all the other functions here directly.
3393  *
3394  * Just say no to tons of global variables, sigh.
3395  */
3396 static struct record record = {
3397 	.opts = {
3398 		.sample_time	     = true,
3399 		.mmap_pages	     = UINT_MAX,
3400 		.user_freq	     = UINT_MAX,
3401 		.user_interval	     = ULLONG_MAX,
3402 		.freq		     = 4000,
3403 		.target		     = {
3404 			.uses_mmap   = true,
3405 			.default_per_cpu = true,
3406 		},
3407 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3408 		.nr_threads_synthesize = 1,
3409 		.ctl_fd              = -1,
3410 		.ctl_fd_ack          = -1,
3411 		.synth               = PERF_SYNTH_ALL,
3412 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3413 	},
3414 };
3415 
3416 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3417 	"\n\t\t\t\tDefault: fp";
3418 
3419 static bool dry_run;
3420 
3421 static struct parse_events_option_args parse_events_option_args = {
3422 	.evlistp = &record.evlist,
3423 };
3424 
3425 static struct parse_events_option_args switch_output_parse_events_option_args = {
3426 	.evlistp = &record.sb_evlist,
3427 };
3428 
3429 /*
3430  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3431  * with it and switch to use the library functions in perf_evlist that came
3432  * from builtin-record.c, i.e. use record_opts,
3433  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3434  * using pipes, etc.
3435  */
3436 static struct option __record_options[] = {
3437 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3438 		     "event selector. use 'perf list' to list available events",
3439 		     parse_events_option),
3440 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3441 		     "event filter", parse_filter),
3442 	OPT_BOOLEAN(0, "latency", &record.latency,
3443 		    "Enable data collection for latency profiling.\n"
3444 		    "\t\t\t  Use perf report --latency for latency-centric profile."),
3445 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3446 			   NULL, "don't record events from perf itself",
3447 			   exclude_perf),
3448 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3449 		    "record events on existing process id"),
3450 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3451 		    "record events on existing thread id"),
3452 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3453 		    "collect data with this RT SCHED_FIFO priority"),
3454 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3455 		    "collect data without buffering"),
3456 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3457 		    "collect raw sample records from all opened counters"),
3458 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3459 			    "system-wide collection from all CPUs"),
3460 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3461 		    "list of cpus to monitor"),
3462 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3463 	OPT_STRING('o', "output", &record.data.path, "file",
3464 		    "output file name"),
3465 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3466 			&record.opts.no_inherit_set,
3467 			"child tasks do not inherit counters"),
3468 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3469 		    "synthesize non-sample events at the end of output"),
3470 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3471 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3472 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3473 		    "Fail if the specified frequency can't be used"),
3474 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3475 		     "profile at this frequency",
3476 		      record__parse_freq),
3477 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3478 		     "number of mmap data pages and AUX area tracing mmap pages",
3479 		     record__parse_mmap_pages),
3480 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3481 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3482 		     record__mmap_flush_parse),
3483 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3484 			   NULL, "enables call-graph recording" ,
3485 			   &record_callchain_opt),
3486 	OPT_CALLBACK(0, "call-graph", &record.opts,
3487 		     "record_mode[,record_size]", record_callchain_help,
3488 		     &record_parse_callchain_opt),
3489 	OPT_INCR('v', "verbose", &verbose,
3490 		    "be more verbose (show counter open errors, etc)"),
3491 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3492 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3493 		    "per thread counts"),
3494 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3495 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3496 		    "Record the sample physical addresses"),
3497 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3498 		    "Record the sampled data address data page size"),
3499 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3500 		    "Record the sampled code address (ip) page size"),
3501 	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3502 		    "Record the data source for memory operations"),
3503 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3504 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3505 		    "Record the sample identifier"),
3506 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3507 			&record.opts.sample_time_set,
3508 			"Record the sample timestamps"),
3509 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3510 			"Record the sample period"),
3511 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3512 		    "don't sample"),
3513 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3514 			&record.no_buildid_cache_set,
3515 			"do not update the buildid cache"),
3516 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3517 			&record.no_buildid_set,
3518 			"do not collect buildids in perf.data"),
3519 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3520 		     "monitor event in cgroup name only",
3521 		     parse_cgroups),
3522 	OPT_CALLBACK('D', "delay", &record, "ms",
3523 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3524 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3525 		     record__parse_event_enable_time),
3526 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3527 	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3528 
3529 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3530 		     "branch any", "sample any taken branches",
3531 		     parse_branch_stack),
3532 
3533 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3534 		     "branch filter mask", "branch stack filter modes",
3535 		     parse_branch_stack),
3536 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3537 		    "sample by weight (on special events only)"),
3538 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3539 		    "sample transaction flags (special events only)"),
3540 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3541 		    "use per-thread mmaps"),
3542 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3543 		    "sample selected machine registers on interrupt,"
3544 		    " use '-I?' to list register names", parse_intr_regs),
3545 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3546 		    "sample selected machine registers in user space,"
3547 		    " use '--user-regs=?' to list register names", parse_user_regs),
3548 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3549 		    "Record running/enabled time of read (:S) events"),
3550 	OPT_CALLBACK('k', "clockid", &record.opts,
3551 	"clockid", "clockid to use for events, see clock_gettime()",
3552 	parse_clockid),
3553 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3554 			  "opts", "AUX area tracing Snapshot Mode", ""),
3555 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3556 			  "opts", "sample AUX area", ""),
3557 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3558 			"per thread proc mmap processing timeout in ms"),
3559 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3560 		    "Record namespaces events"),
3561 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3562 		    "Record cgroup events"),
3563 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3564 			&record.opts.record_switch_events_set,
3565 			"Record context switch events"),
3566 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3567 			 "Configure all used events to run in kernel space.",
3568 			 PARSE_OPT_EXCLUSIVE),
3569 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3570 			 "Configure all used events to run in user space.",
3571 			 PARSE_OPT_EXCLUSIVE),
3572 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3573 		    "collect kernel callchains"),
3574 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3575 		    "collect user callchains"),
3576 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3577 		   "file", "vmlinux pathname"),
3578 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3579 		    "Record build-id of all DSOs regardless of hits"),
3580 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3581 		    "Record build-id in map events"),
3582 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3583 		    "append timestamp to output filename"),
3584 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3585 		    "Record timestamp boundary (time of first/last samples)"),
3586 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3587 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3588 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3589 			  "signal"),
3590 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3591 			 &record.switch_output_event_set, "switch output event",
3592 			 "switch output event selector. use 'perf list' to list available events",
3593 			 parse_events_option_new_evlist),
3594 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3595 		   "Limit number of switch output generated files"),
3596 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3597 		    "Parse options then exit"),
3598 #ifdef HAVE_AIO_SUPPORT
3599 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3600 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3601 		     record__aio_parse),
3602 #endif
3603 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3604 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3605 		     record__parse_affinity),
3606 #ifdef HAVE_ZSTD_SUPPORT
3607 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3608 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3609 			    record__parse_comp_level),
3610 #endif
3611 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3612 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3613 	OPT_UINTEGER(0, "num-thread-synthesize",
3614 		     &record.opts.nr_threads_synthesize,
3615 		     "number of threads to run for event synthesis"),
3616 #ifdef HAVE_LIBPFM
3617 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3618 		"libpfm4 event selector. use 'perf list' to list available events",
3619 		parse_libpfm_events_option),
3620 #endif
3621 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3622 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3623 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3624 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3625 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3626 		      parse_control_option),
3627 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3628 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3629 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3630 			  &record.debuginfod.set, "debuginfod urls",
3631 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3632 			  "system"),
3633 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3634 			    "write collected trace data into several data files using parallel threads",
3635 			    record__parse_threads),
3636 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3637 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3638 		   "BPF filter action"),
3639 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3640 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3641 		     record__parse_off_cpu_thresh),
3642 	OPT_END()
3643 };
3644 
3645 struct option *record_options = __record_options;
3646 
3647 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3648 {
3649 	struct perf_cpu cpu;
3650 	int idx;
3651 
3652 	if (cpu_map__is_dummy(cpus))
3653 		return 0;
3654 
3655 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3656 		/* Return ENODEV is input cpu is greater than max cpu */
3657 		if ((unsigned long)cpu.cpu > mask->nbits)
3658 			return -ENODEV;
3659 		__set_bit(cpu.cpu, mask->bits);
3660 	}
3661 
3662 	return 0;
3663 }
3664 
3665 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3666 {
3667 	struct perf_cpu_map *cpus;
3668 
3669 	cpus = perf_cpu_map__new(mask_spec);
3670 	if (!cpus)
3671 		return -ENOMEM;
3672 
3673 	bitmap_zero(mask->bits, mask->nbits);
3674 	if (record__mmap_cpu_mask_init(mask, cpus))
3675 		return -ENODEV;
3676 
3677 	perf_cpu_map__put(cpus);
3678 
3679 	return 0;
3680 }
3681 
3682 static void record__free_thread_masks(struct record *rec, int nr_threads)
3683 {
3684 	int t;
3685 
3686 	if (rec->thread_masks)
3687 		for (t = 0; t < nr_threads; t++)
3688 			record__thread_mask_free(&rec->thread_masks[t]);
3689 
3690 	zfree(&rec->thread_masks);
3691 }
3692 
3693 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3694 {
3695 	int t, ret;
3696 
3697 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3698 	if (!rec->thread_masks) {
3699 		pr_err("Failed to allocate thread masks\n");
3700 		return -ENOMEM;
3701 	}
3702 
3703 	for (t = 0; t < nr_threads; t++) {
3704 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3705 		if (ret) {
3706 			pr_err("Failed to allocate thread masks[%d]\n", t);
3707 			goto out_free;
3708 		}
3709 	}
3710 
3711 	return 0;
3712 
3713 out_free:
3714 	record__free_thread_masks(rec, nr_threads);
3715 
3716 	return ret;
3717 }
3718 
3719 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3720 {
3721 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3722 
3723 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3724 	if (ret)
3725 		return ret;
3726 
3727 	rec->nr_threads = nr_cpus;
3728 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3729 
3730 	for (t = 0; t < rec->nr_threads; t++) {
3731 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3732 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3733 		if (verbose > 0) {
3734 			pr_debug("thread_masks[%d]: ", t);
3735 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3736 			pr_debug("thread_masks[%d]: ", t);
3737 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3738 		}
3739 	}
3740 
3741 	return 0;
3742 }
3743 
3744 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3745 					  const char **maps_spec, const char **affinity_spec,
3746 					  u32 nr_spec)
3747 {
3748 	u32 s;
3749 	int ret = 0, t = 0;
3750 	struct mmap_cpu_mask cpus_mask;
3751 	struct thread_mask thread_mask, full_mask, *thread_masks;
3752 
3753 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3754 	if (ret) {
3755 		pr_err("Failed to allocate CPUs mask\n");
3756 		return ret;
3757 	}
3758 
3759 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3760 	if (ret) {
3761 		pr_err("Failed to init cpu mask\n");
3762 		goto out_free_cpu_mask;
3763 	}
3764 
3765 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3766 	if (ret) {
3767 		pr_err("Failed to allocate full mask\n");
3768 		goto out_free_cpu_mask;
3769 	}
3770 
3771 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3772 	if (ret) {
3773 		pr_err("Failed to allocate thread mask\n");
3774 		goto out_free_full_and_cpu_masks;
3775 	}
3776 
3777 	for (s = 0; s < nr_spec; s++) {
3778 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3779 		if (ret) {
3780 			pr_err("Failed to initialize maps thread mask\n");
3781 			goto out_free;
3782 		}
3783 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3784 		if (ret) {
3785 			pr_err("Failed to initialize affinity thread mask\n");
3786 			goto out_free;
3787 		}
3788 
3789 		/* ignore invalid CPUs but do not allow empty masks */
3790 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3791 				cpus_mask.bits, thread_mask.maps.nbits)) {
3792 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3793 			ret = -EINVAL;
3794 			goto out_free;
3795 		}
3796 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3797 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3798 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3799 			ret = -EINVAL;
3800 			goto out_free;
3801 		}
3802 
3803 		/* do not allow intersection with other masks (full_mask) */
3804 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3805 				      thread_mask.maps.nbits)) {
3806 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3807 			ret = -EINVAL;
3808 			goto out_free;
3809 		}
3810 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3811 				      thread_mask.affinity.nbits)) {
3812 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3813 			ret = -EINVAL;
3814 			goto out_free;
3815 		}
3816 
3817 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3818 			  thread_mask.maps.bits, full_mask.maps.nbits);
3819 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3820 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3821 
3822 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3823 		if (!thread_masks) {
3824 			pr_err("Failed to reallocate thread masks\n");
3825 			ret = -ENOMEM;
3826 			goto out_free;
3827 		}
3828 		rec->thread_masks = thread_masks;
3829 		rec->thread_masks[t] = thread_mask;
3830 		if (verbose > 0) {
3831 			pr_debug("thread_masks[%d]: ", t);
3832 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3833 			pr_debug("thread_masks[%d]: ", t);
3834 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3835 		}
3836 		t++;
3837 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3838 		if (ret) {
3839 			pr_err("Failed to allocate thread mask\n");
3840 			goto out_free_full_and_cpu_masks;
3841 		}
3842 	}
3843 	rec->nr_threads = t;
3844 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3845 	if (!rec->nr_threads)
3846 		ret = -EINVAL;
3847 
3848 out_free:
3849 	record__thread_mask_free(&thread_mask);
3850 out_free_full_and_cpu_masks:
3851 	record__thread_mask_free(&full_mask);
3852 out_free_cpu_mask:
3853 	record__mmap_cpu_mask_free(&cpus_mask);
3854 
3855 	return ret;
3856 }
3857 
3858 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3859 {
3860 	int ret;
3861 	struct cpu_topology *topo;
3862 
3863 	topo = cpu_topology__new();
3864 	if (!topo) {
3865 		pr_err("Failed to allocate CPU topology\n");
3866 		return -ENOMEM;
3867 	}
3868 
3869 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3870 					     topo->core_cpus_list, topo->core_cpus_lists);
3871 	cpu_topology__delete(topo);
3872 
3873 	return ret;
3874 }
3875 
3876 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3877 {
3878 	int ret;
3879 	struct cpu_topology *topo;
3880 
3881 	topo = cpu_topology__new();
3882 	if (!topo) {
3883 		pr_err("Failed to allocate CPU topology\n");
3884 		return -ENOMEM;
3885 	}
3886 
3887 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3888 					     topo->package_cpus_list, topo->package_cpus_lists);
3889 	cpu_topology__delete(topo);
3890 
3891 	return ret;
3892 }
3893 
3894 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3895 {
3896 	u32 s;
3897 	int ret;
3898 	const char **spec;
3899 	struct numa_topology *topo;
3900 
3901 	topo = numa_topology__new();
3902 	if (!topo) {
3903 		pr_err("Failed to allocate NUMA topology\n");
3904 		return -ENOMEM;
3905 	}
3906 
3907 	spec = zalloc(topo->nr * sizeof(char *));
3908 	if (!spec) {
3909 		pr_err("Failed to allocate NUMA spec\n");
3910 		ret = -ENOMEM;
3911 		goto out_delete_topo;
3912 	}
3913 	for (s = 0; s < topo->nr; s++)
3914 		spec[s] = topo->nodes[s].cpus;
3915 
3916 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3917 
3918 	zfree(&spec);
3919 
3920 out_delete_topo:
3921 	numa_topology__delete(topo);
3922 
3923 	return ret;
3924 }
3925 
3926 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3927 {
3928 	int t, ret;
3929 	u32 s, nr_spec = 0;
3930 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3931 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3932 
3933 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3934 		spec = strtok_r(user_spec, ":", &spec_ptr);
3935 		if (spec == NULL)
3936 			break;
3937 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3938 		mask = strtok_r(spec, "/", &mask_ptr);
3939 		if (mask == NULL)
3940 			break;
3941 		pr_debug2("  maps mask: %s\n", mask);
3942 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3943 		if (!tmp_spec) {
3944 			pr_err("Failed to reallocate maps spec\n");
3945 			ret = -ENOMEM;
3946 			goto out_free;
3947 		}
3948 		maps_spec = tmp_spec;
3949 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3950 		if (!maps_spec[nr_spec]) {
3951 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3952 			ret = -ENOMEM;
3953 			goto out_free;
3954 		}
3955 		mask = strtok_r(NULL, "/", &mask_ptr);
3956 		if (mask == NULL) {
3957 			pr_err("Invalid thread maps or affinity specs\n");
3958 			ret = -EINVAL;
3959 			goto out_free;
3960 		}
3961 		pr_debug2("  affinity mask: %s\n", mask);
3962 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3963 		if (!tmp_spec) {
3964 			pr_err("Failed to reallocate affinity spec\n");
3965 			ret = -ENOMEM;
3966 			goto out_free;
3967 		}
3968 		affinity_spec = tmp_spec;
3969 		affinity_spec[nr_spec] = strdup(mask);
3970 		if (!affinity_spec[nr_spec]) {
3971 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3972 			ret = -ENOMEM;
3973 			goto out_free;
3974 		}
3975 		dup_mask = NULL;
3976 		nr_spec++;
3977 	}
3978 
3979 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3980 					     (const char **)affinity_spec, nr_spec);
3981 
3982 out_free:
3983 	free(dup_mask);
3984 	for (s = 0; s < nr_spec; s++) {
3985 		if (maps_spec)
3986 			free(maps_spec[s]);
3987 		if (affinity_spec)
3988 			free(affinity_spec[s]);
3989 	}
3990 	free(affinity_spec);
3991 	free(maps_spec);
3992 
3993 	return ret;
3994 }
3995 
3996 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3997 {
3998 	int ret;
3999 
4000 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4001 	if (ret)
4002 		return ret;
4003 
4004 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4005 		return -ENODEV;
4006 
4007 	rec->nr_threads = 1;
4008 
4009 	return 0;
4010 }
4011 
4012 static int record__init_thread_masks(struct record *rec)
4013 {
4014 	int ret = 0;
4015 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4016 
4017 	if (!record__threads_enabled(rec))
4018 		return record__init_thread_default_masks(rec, cpus);
4019 
4020 	if (evlist__per_thread(rec->evlist)) {
4021 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4022 		return -EINVAL;
4023 	}
4024 
4025 	switch (rec->opts.threads_spec) {
4026 	case THREAD_SPEC__CPU:
4027 		ret = record__init_thread_cpu_masks(rec, cpus);
4028 		break;
4029 	case THREAD_SPEC__CORE:
4030 		ret = record__init_thread_core_masks(rec, cpus);
4031 		break;
4032 	case THREAD_SPEC__PACKAGE:
4033 		ret = record__init_thread_package_masks(rec, cpus);
4034 		break;
4035 	case THREAD_SPEC__NUMA:
4036 		ret = record__init_thread_numa_masks(rec, cpus);
4037 		break;
4038 	case THREAD_SPEC__USER:
4039 		ret = record__init_thread_user_masks(rec, cpus);
4040 		break;
4041 	default:
4042 		break;
4043 	}
4044 
4045 	return ret;
4046 }
4047 
4048 int cmd_record(int argc, const char **argv)
4049 {
4050 	int err;
4051 	struct record *rec = &record;
4052 	char errbuf[BUFSIZ];
4053 
4054 	setlocale(LC_ALL, "");
4055 
4056 #ifndef HAVE_BPF_SKEL
4057 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4058 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4059 # undef set_nobuild
4060 #endif
4061 
4062 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
4063 	symbol_conf.lazy_load_kernel_maps = true;
4064 	rec->opts.affinity = PERF_AFFINITY_SYS;
4065 
4066 	rec->evlist = evlist__new();
4067 	if (rec->evlist == NULL)
4068 		return -ENOMEM;
4069 
4070 	err = perf_config(perf_record_config, rec);
4071 	if (err)
4072 		return err;
4073 
4074 	argc = parse_options(argc, argv, record_options, record_usage,
4075 			    PARSE_OPT_STOP_AT_NON_OPTION);
4076 	if (quiet)
4077 		perf_quiet_option();
4078 
4079 	err = symbol__validate_sym_arguments();
4080 	if (err)
4081 		return err;
4082 
4083 	perf_debuginfod_setup(&record.debuginfod);
4084 
4085 	/* Make system wide (-a) the default target. */
4086 	if (!argc && target__none(&rec->opts.target))
4087 		rec->opts.target.system_wide = true;
4088 
4089 	if (nr_cgroups && !rec->opts.target.system_wide) {
4090 		usage_with_options_msg(record_usage, record_options,
4091 			"cgroup monitoring only available in system-wide mode");
4092 
4093 	}
4094 
4095 	if (record.latency) {
4096 		/*
4097 		 * There is no fundamental reason why latency profiling
4098 		 * can't work for system-wide mode, but exact semantics
4099 		 * and details are to be defined.
4100 		 * See the following thread for details:
4101 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4102 		 */
4103 		if (record.opts.target.system_wide) {
4104 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4105 			err = -EINVAL;
4106 			goto out_opts;
4107 		}
4108 		record.opts.record_switch_events = true;
4109 	}
4110 
4111 	if (rec->buildid_mmap) {
4112 		if (!perf_can_record_build_id()) {
4113 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4114 			err = -EINVAL;
4115 			goto out_opts;
4116 		}
4117 		pr_debug("Enabling build id in mmap2 events.\n");
4118 		/* Enable mmap build id synthesizing. */
4119 		symbol_conf.buildid_mmap2 = true;
4120 		/* Enable perf_event_attr::build_id bit. */
4121 		rec->opts.build_id = true;
4122 		/* Disable build id cache. */
4123 		rec->no_buildid = true;
4124 	}
4125 
4126 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4127 		pr_err("Kernel has no cgroup sampling support.\n");
4128 		err = -EINVAL;
4129 		goto out_opts;
4130 	}
4131 
4132 	if (rec->opts.kcore)
4133 		rec->opts.text_poke = true;
4134 
4135 	if (rec->opts.kcore || record__threads_enabled(rec))
4136 		rec->data.is_dir = true;
4137 
4138 	if (record__threads_enabled(rec)) {
4139 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4140 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4141 			goto out_opts;
4142 		}
4143 		if (record__aio_enabled(rec)) {
4144 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4145 			goto out_opts;
4146 		}
4147 	}
4148 
4149 	if (rec->opts.comp_level != 0) {
4150 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4151 		rec->no_buildid = true;
4152 	}
4153 
4154 	if (rec->opts.record_switch_events &&
4155 	    !perf_can_record_switch_events()) {
4156 		ui__error("kernel does not support recording context switch events\n");
4157 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4158 		err = -EINVAL;
4159 		goto out_opts;
4160 	}
4161 
4162 	if (switch_output_setup(rec)) {
4163 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4164 		err = -EINVAL;
4165 		goto out_opts;
4166 	}
4167 
4168 	if (rec->switch_output.time) {
4169 		signal(SIGALRM, alarm_sig_handler);
4170 		alarm(rec->switch_output.time);
4171 	}
4172 
4173 	if (rec->switch_output.num_files) {
4174 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4175 						      sizeof(char *));
4176 		if (!rec->switch_output.filenames) {
4177 			err = -EINVAL;
4178 			goto out_opts;
4179 		}
4180 	}
4181 
4182 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4183 		rec->timestamp_filename = false;
4184 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4185 	}
4186 
4187 	if (rec->filter_action) {
4188 		if (!strcmp(rec->filter_action, "pin"))
4189 			err = perf_bpf_filter__pin();
4190 		else if (!strcmp(rec->filter_action, "unpin"))
4191 			err = perf_bpf_filter__unpin();
4192 		else {
4193 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4194 			err = -EINVAL;
4195 		}
4196 		goto out_opts;
4197 	}
4198 
4199 	/* For backward compatibility, -d implies --mem-info */
4200 	if (rec->opts.sample_address)
4201 		rec->opts.sample_data_src = true;
4202 
4203 	/*
4204 	 * Allow aliases to facilitate the lookup of symbols for address
4205 	 * filters. Refer to auxtrace_parse_filters().
4206 	 */
4207 	symbol_conf.allow_aliases = true;
4208 
4209 	symbol__init(NULL);
4210 
4211 	err = record__auxtrace_init(rec);
4212 	if (err)
4213 		goto out;
4214 
4215 	if (dry_run)
4216 		goto out;
4217 
4218 	err = -ENOMEM;
4219 
4220 	if (rec->no_buildid_cache || rec->no_buildid) {
4221 		disable_buildid_cache();
4222 	} else if (rec->switch_output.enabled) {
4223 		/*
4224 		 * In 'perf record --switch-output', disable buildid
4225 		 * generation by default to reduce data file switching
4226 		 * overhead. Still generate buildid if they are required
4227 		 * explicitly using
4228 		 *
4229 		 *  perf record --switch-output --no-no-buildid \
4230 		 *              --no-no-buildid-cache
4231 		 *
4232 		 * Following code equals to:
4233 		 *
4234 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4235 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4236 		 *         disable_buildid_cache();
4237 		 */
4238 		bool disable = true;
4239 
4240 		if (rec->no_buildid_set && !rec->no_buildid)
4241 			disable = false;
4242 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4243 			disable = false;
4244 		if (disable) {
4245 			rec->no_buildid = true;
4246 			rec->no_buildid_cache = true;
4247 			disable_buildid_cache();
4248 		}
4249 	}
4250 
4251 	if (record.opts.overwrite)
4252 		record.opts.tail_synthesize = true;
4253 
4254 	if (rec->evlist->core.nr_entries == 0) {
4255 		err = parse_event(rec->evlist, "cycles:P");
4256 		if (err)
4257 			goto out;
4258 	}
4259 
4260 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4261 		rec->opts.no_inherit = true;
4262 
4263 	err = target__validate(&rec->opts.target);
4264 	if (err) {
4265 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4266 		ui__warning("%s\n", errbuf);
4267 	}
4268 
4269 	if (rec->uid_str) {
4270 		uid_t uid = parse_uid(rec->uid_str);
4271 
4272 		if (uid == UINT_MAX) {
4273 			ui__error("Invalid User: %s", rec->uid_str);
4274 			err = -EINVAL;
4275 			goto out;
4276 		}
4277 		err = parse_uid_filter(rec->evlist, uid);
4278 		if (err)
4279 			goto out;
4280 
4281 		/* User ID filtering implies system wide. */
4282 		rec->opts.target.system_wide = true;
4283 	}
4284 
4285 	/* Enable ignoring missing threads when -p option is defined. */
4286 	rec->opts.ignore_missing_thread = rec->opts.target.pid;
4287 
4288 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4289 
4290 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4291 		arch__add_leaf_frame_record_opts(&rec->opts);
4292 
4293 	err = -ENOMEM;
4294 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4295 		if (rec->opts.target.pid != NULL) {
4296 			pr_err("Couldn't create thread/CPU maps: %s\n",
4297 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4298 			goto out;
4299 		}
4300 		else
4301 			usage_with_options(record_usage, record_options);
4302 	}
4303 
4304 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4305 	if (err)
4306 		goto out;
4307 
4308 	/*
4309 	 * We take all buildids when the file contains
4310 	 * AUX area tracing data because we do not decode the
4311 	 * trace because it would take too long.
4312 	 */
4313 	if (rec->opts.full_auxtrace)
4314 		rec->buildid_all = true;
4315 
4316 	if (rec->opts.text_poke) {
4317 		err = record__config_text_poke(rec->evlist);
4318 		if (err) {
4319 			pr_err("record__config_text_poke failed, error %d\n", err);
4320 			goto out;
4321 		}
4322 	}
4323 
4324 	if (rec->off_cpu) {
4325 		err = record__config_off_cpu(rec);
4326 		if (err) {
4327 			pr_err("record__config_off_cpu failed, error %d\n", err);
4328 			goto out;
4329 		}
4330 	}
4331 
4332 	if (record_opts__config(&rec->opts)) {
4333 		err = -EINVAL;
4334 		goto out;
4335 	}
4336 
4337 	err = record__config_tracking_events(rec);
4338 	if (err) {
4339 		pr_err("record__config_tracking_events failed, error %d\n", err);
4340 		goto out;
4341 	}
4342 
4343 	err = record__init_thread_masks(rec);
4344 	if (err) {
4345 		pr_err("Failed to initialize parallel data streaming masks\n");
4346 		goto out;
4347 	}
4348 
4349 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4350 		rec->opts.nr_cblocks = nr_cblocks_max;
4351 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4352 
4353 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4354 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4355 
4356 	if (rec->opts.comp_level > comp_level_max)
4357 		rec->opts.comp_level = comp_level_max;
4358 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4359 
4360 	err = __cmd_record(&record, argc, argv);
4361 out:
4362 	record__free_thread_masks(rec, rec->nr_threads);
4363 	rec->nr_threads = 0;
4364 	symbol__exit();
4365 	auxtrace_record__free(rec->itr);
4366 out_opts:
4367 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4368 	evlist__delete(rec->evlist);
4369 	return err;
4370 }
4371 
4372 static void snapshot_sig_handler(int sig __maybe_unused)
4373 {
4374 	struct record *rec = &record;
4375 
4376 	hit_auxtrace_snapshot_trigger(rec);
4377 
4378 	if (switch_output_signal(rec))
4379 		trigger_hit(&switch_output_trigger);
4380 }
4381 
4382 static void alarm_sig_handler(int sig __maybe_unused)
4383 {
4384 	struct record *rec = &record;
4385 
4386 	if (switch_output_time(rec))
4387 		trigger_hit(&switch_output_trigger);
4388 }
4389