xref: /linux/tools/perf/builtin-record.c (revision 25489a4f556414445d342951615178368ee45cde)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
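/*
 * State for --switch-output: rotate the perf.data output when the switch
 * signal arrives or when a size/time threshold is crossed, keeping at most
 * num_files previous output files around.
 */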
86 struct switch_output {
87 	bool		 enabled;
88 	bool		 signal;
89 	unsigned long	 size;
90 	unsigned long	 time;
91 	const char	*str;
92 	bool		 set;
93 	char		 **filenames;
94 	int		 num_files;
95 	int		 cur_file;
96 };
97 
98 struct thread_mask {
99 	struct mmap_cpu_mask	maps;
100 	struct mmap_cpu_mask	affinity;
101 };
102 
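/*
 * Per-thread state for parallel trace streaming (--threads): the mmaps this
 * thread services, its pollfd set, the msg/ack pipes used to synchronize with
 * the main thread, and per-thread byte/sample counters.
 */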
103 struct record_thread {
104 	pid_t			tid;
105 	struct thread_mask	*mask;
106 	struct {
107 		int		msg[2];
108 		int		ack[2];
109 	} pipes;
110 	struct fdarray		pollfd;
111 	int			ctlfd_pos;
112 	int			nr_mmaps;
113 	struct mmap		**maps;
114 	struct mmap		**overwrite_maps;
115 	struct record		*rec;
116 	unsigned long long	samples;
117 	unsigned long		waking;
118 	u64			bytes_written;
119 	u64			bytes_transferred;
120 	u64			bytes_compressed;
121 };
122 
123 static __thread struct record_thread *thread;
124 
125 enum thread_msg {
126 	THREAD_MSG__UNDEFINED = 0,
127 	THREAD_MSG__READY,
128 	THREAD_MSG__MAX,
129 };
130 
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 	"UNDEFINED", "READY"
133 };
134 
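/*
 * Ways --threads can partition the mmaps among streaming threads: one thread
 * per CPU, core, package or NUMA node, or user-defined mask pairs.
 */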
135 enum thread_spec {
136 	THREAD_SPEC__UNDEFINED = 0,
137 	THREAD_SPEC__CPU,
138 	THREAD_SPEC__CORE,
139 	THREAD_SPEC__PACKAGE,
140 	THREAD_SPEC__NUMA,
141 	THREAD_SPEC__USER,
142 	THREAD_SPEC__MAX,
143 };
144 
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 	"undefined", "cpu", "core", "package", "numa", "user"
147 };
148 
149 struct pollfd_index_map {
150 	int evlist_pollfd_index;
151 	int thread_pollfd_index;
152 };
153 
154 struct record {
155 	struct perf_tool	tool;
156 	struct record_opts	opts;
157 	u64			bytes_written;
158 	u64			thread_bytes_written;
159 	struct perf_data	data;
160 	struct auxtrace_record	*itr;
161 	struct evlist	*evlist;
162 	struct perf_session	*session;
163 	struct evlist		*sb_evlist;
164 	pthread_t		thread_id;
165 	int			realtime_prio;
166 	bool			latency;
167 	bool			switch_output_event_set;
168 	bool			no_buildid;
169 	bool			no_buildid_set;
170 	bool			no_buildid_cache;
171 	bool			no_buildid_cache_set;
172 	bool			buildid_all;
173 	bool			buildid_mmap;
174 	bool			timestamp_filename;
175 	bool			timestamp_boundary;
176 	bool			off_cpu;
177 	const char		*filter_action;
178 	struct switch_output	switch_output;
179 	unsigned long long	samples;
180 	unsigned long		output_max_size;	/* = 0: unlimited */
181 	struct perf_debuginfod	debuginfod;
182 	int			nr_threads;
183 	struct thread_mask	*thread_masks;
184 	struct record_thread	*thread_data;
185 	struct pollfd_index_map	*index_map;
186 	size_t			index_map_sz;
187 	size_t			index_map_cnt;
188 };
189 
190 static volatile int done;
191 
192 static volatile int auxtrace_record__snapshot_started;
193 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
194 static DEFINE_TRIGGER(switch_output_trigger);
195 
196 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
197 	"SYS", "NODE", "CPU"
198 };
199 
200 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
201 				  struct perf_sample *sample, struct machine *machine);
202 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
203 				   struct perf_sample *sample, struct machine *machine);
204 static int process_timestamp_boundary(const struct perf_tool *tool,
205 				      union perf_event *event,
206 				      struct perf_sample *sample,
207 				      struct machine *machine);
208 
209 #ifndef HAVE_GETTID
210 static inline pid_t gettid(void)
211 {
212 	return (pid_t)syscall(__NR_gettid);
213 }
214 #endif
215 
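/* Non-zero when parallel trace streaming was requested via --threads. */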
216 static int record__threads_enabled(struct record *rec)
217 {
218 	return rec->opts.threads_spec;
219 }
220 
221 static bool switch_output_signal(struct record *rec)
222 {
223 	return rec->switch_output.signal &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static bool switch_output_size(struct record *rec)
228 {
229 	return rec->switch_output.size &&
230 	       trigger_is_ready(&switch_output_trigger) &&
231 	       (rec->bytes_written >= rec->switch_output.size);
232 }
233 
234 static bool switch_output_time(struct record *rec)
235 {
236 	return rec->switch_output.time &&
237 	       trigger_is_ready(&switch_output_trigger);
238 }
239 
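/* Total bytes written so far by the main thread plus all streaming threads. */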
240 static u64 record__bytes_written(struct record *rec)
241 {
242 	return rec->bytes_written + rec->thread_bytes_written;
243 }
244 
245 static bool record__output_max_size_exceeded(struct record *rec)
246 {
247 	return rec->output_max_size &&
248 	       (record__bytes_written(rec) >= rec->output_max_size);
249 }
250 
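/*
 * Write a block of bytes to the output: to the per-mmap file in threaded
 * mode, otherwise to the main perf.data file. Also accounts the written
 * bytes and evaluates the size-limit and switch-output conditions.
 */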
251 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
252 			 void *bf, size_t size)
253 {
254 	struct perf_data_file *file = &rec->session->data->file;
255 
256 	if (map && map->file)
257 		file = map->file;
258 
259 	if (perf_data_file__write(file, bf, size) < 0) {
260 		pr_err("failed to write perf data, error: %m\n");
261 		return -1;
262 	}
263 
264 	if (map && map->file) {
265 		thread->bytes_written += size;
266 		rec->thread_bytes_written += size;
267 	} else {
268 		rec->bytes_written += size;
269 	}
270 
271 	if (record__output_max_size_exceeded(rec) && !done) {
272 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
273 				" stopping session ]\n",
274 				record__bytes_written(rec) >> 10);
275 		done = 1;
276 	}
277 
278 	if (switch_output_size(rec))
279 		trigger_hit(&switch_output_trigger);
280 
281 	return 0;
282 }
283 
284 static int record__aio_enabled(struct record *rec);
285 static int record__comp_enabled(struct record *rec);
286 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
287 			    void *dst, size_t dst_size, void *src, size_t src_size);
288 
289 #ifdef HAVE_AIO_SUPPORT
290 static int record__aio_write(struct aiocb *cblock, int trace_fd,
291 		void *buf, size_t size, off_t off)
292 {
293 	int rc;
294 
295 	cblock->aio_fildes = trace_fd;
296 	cblock->aio_buf    = buf;
297 	cblock->aio_nbytes = size;
298 	cblock->aio_offset = off;
299 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
300 
301 	do {
302 		rc = aio_write(cblock);
303 		if (rc == 0) {
304 			break;
305 		} else if (errno != EAGAIN) {
306 			cblock->aio_fildes = -1;
307 			pr_err("failed to queue perf data, error: %m\n");
308 			break;
309 		}
310 	} while (1);
311 
312 	return rc;
313 }
314 
315 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
316 {
317 	void *rem_buf;
318 	off_t rem_off;
319 	size_t rem_size;
320 	int rc, aio_errno;
321 	ssize_t aio_ret, written;
322 
323 	aio_errno = aio_error(cblock);
324 	if (aio_errno == EINPROGRESS)
325 		return 0;
326 
327 	written = aio_ret = aio_return(cblock);
328 	if (aio_ret < 0) {
329 		if (aio_errno != EINTR)
330 			pr_err("failed to write perf data, error: %m\n");
331 		written = 0;
332 	}
333 
334 	rem_size = cblock->aio_nbytes - written;
335 
336 	if (rem_size == 0) {
337 		cblock->aio_fildes = -1;
338 		/*
339 		 * md->refcount is incremented in record__aio_pushfn() for
340 		 * every aio write request started in record__aio_push() so
341 		 * decrement it because the request is now complete.
342 		 */
343 		perf_mmap__put(&md->core);
344 		rc = 1;
345 	} else {
346 		/*
347 		 * The aio write request may need to be restarted with the
348 		 * remainder if the kernel didn't write the whole
349 		 * chunk at once.
350 		 */
351 		rem_off = cblock->aio_offset + written;
352 		rem_buf = (void *)(cblock->aio_buf + written);
353 		record__aio_write(cblock, cblock->aio_fildes,
354 				rem_buf, rem_size, rem_off);
355 		rc = 0;
356 	}
357 
358 	return rc;
359 }
360 
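/*
 * Reap completed aio write requests. With sync_all == false, return the index
 * of the first free control block, waiting if necessary; with sync_all ==
 * true, return -1 only after all outstanding requests have completed.
 */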
361 static int record__aio_sync(struct mmap *md, bool sync_all)
362 {
363 	struct aiocb **aiocb = md->aio.aiocb;
364 	struct aiocb *cblocks = md->aio.cblocks;
365 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
366 	int i, do_suspend;
367 
368 	do {
369 		do_suspend = 0;
370 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
371 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
372 				if (sync_all)
373 					aiocb[i] = NULL;
374 				else
375 					return i;
376 			} else {
377 				/*
378 				 * The started aio write is not complete
379 				 * yet, so it has to be waited on before
380 				 * the next allocation.
381 				 */
382 				aiocb[i] = &cblocks[i];
383 				do_suspend = 1;
384 			}
385 		}
386 		if (!do_suspend)
387 			return -1;
388 
389 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
390 			if (!(errno == EAGAIN || errno == EINTR))
391 				pr_err("failed to sync perf data, error: %m\n");
392 		}
393 	} while (1);
394 }
395 
396 struct record_aio {
397 	struct record	*rec;
398 	void		*data;
399 	size_t		size;
400 };
401 
402 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
403 {
404 	struct record_aio *aio = to;
405 
406 	/*
407 	 * The map->core.base data pointed to by buf is copied into a free
408 	 * map->aio.data[] buffer, to release space in the kernel buffer as fast
409 	 * as possible by calling perf_mmap__consume() from perf_mmap__push().
410 	 *
411 	 * That lets the kernel proceed with storing more profiling data into the
412 	 * kernel buffer earlier than other per-cpu kernel buffers are handled.
413 	 *
414 	 * Copying can be done in two steps in case the chunk of profiling data
415 	 * crosses the upper bound of the kernel buffer. In that case we first move
416 	 * the part of the data from map->start till the upper bound, and then the
417 	 * remainder from the beginning of the kernel buffer till the end of the chunk.
418 	 */
419 
420 	if (record__comp_enabled(aio->rec)) {
421 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
422 						   mmap__mmap_len(map) - aio->size,
423 						   buf, size);
424 		if (compressed < 0)
425 			return (int)compressed;
426 
427 		size = compressed;
428 	} else {
429 		memcpy(aio->data + aio->size, buf, size);
430 	}
431 
432 	if (!aio->size) {
433 		/*
434 		 * Increment map->refcount to guard the map->aio.data[] buffer
435 		 * from premature deallocation, because the map object can be
436 		 * released before the aio write request started on the
437 		 * map->aio.data[] buffer is complete.
438 		 *
439 		 * perf_mmap__put() is done in record__aio_complete() after the
440 		 * started aio request completes, or in record__aio_push()
441 		 * if the request failed to start.
442 		 */
443 		perf_mmap__get(&map->core);
444 	}
445 
446 	aio->size += size;
447 
448 	return size;
449 }
450 
451 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
452 {
453 	int ret, idx;
454 	int trace_fd = rec->session->data->file.fd;
455 	struct record_aio aio = { .rec = rec, .size = 0 };
456 
457 	/*
458 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
459 	 * becomes available after the previous aio write operation.
460 	 */
461 
462 	idx = record__aio_sync(map, false);
463 	aio.data = map->aio.data[idx];
464 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
465 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
466 		return ret;
467 
468 	rec->samples++;
469 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
470 	if (!ret) {
471 		*off += aio.size;
472 		rec->bytes_written += aio.size;
473 		if (switch_output_size(rec))
474 			trigger_hit(&switch_output_trigger);
475 	} else {
476 		/*
477 		 * Decrement the map->refcount incremented in record__aio_pushfn()
478 		 * if the record__aio_write() operation failed to start; otherwise
479 		 * map->refcount is decremented in record__aio_complete() after
480 		 * the aio write operation finishes successfully.
481 		 */
482 		perf_mmap__put(&map->core);
483 	}
484 
485 	return ret;
486 }
487 
488 static off_t record__aio_get_pos(int trace_fd)
489 {
490 	return lseek(trace_fd, 0, SEEK_CUR);
491 }
492 
493 static void record__aio_set_pos(int trace_fd, off_t pos)
494 {
495 	lseek(trace_fd, pos, SEEK_SET);
496 }
497 
498 static void record__aio_mmap_read_sync(struct record *rec)
499 {
500 	int i;
501 	struct evlist *evlist = rec->evlist;
502 	struct mmap *maps = evlist->mmap;
503 
504 	if (!record__aio_enabled(rec))
505 		return;
506 
507 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
508 		struct mmap *map = &maps[i];
509 
510 		if (map->core.base)
511 			record__aio_sync(map, true);
512 	}
513 }
514 
515 static int nr_cblocks_default = 1;
516 static int nr_cblocks_max = 4;
517 
518 static int record__aio_parse(const struct option *opt,
519 			     const char *str,
520 			     int unset)
521 {
522 	struct record_opts *opts = (struct record_opts *)opt->value;
523 
524 	if (unset) {
525 		opts->nr_cblocks = 0;
526 	} else {
527 		if (str)
528 			opts->nr_cblocks = strtol(str, NULL, 0);
529 		if (!opts->nr_cblocks)
530 			opts->nr_cblocks = nr_cblocks_default;
531 	}
532 
533 	return 0;
534 }
535 #else /* HAVE_AIO_SUPPORT */
536 static int nr_cblocks_max = 0;
537 
538 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
539 			    off_t *off __maybe_unused)
540 {
541 	return -1;
542 }
543 
544 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
545 {
546 	return -1;
547 }
548 
549 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
550 {
551 }
552 
553 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
554 {
555 }
556 #endif
557 
558 static int record__aio_enabled(struct record *rec)
559 {
560 	return rec->opts.nr_cblocks > 0;
561 }
562 
563 #define MMAP_FLUSH_DEFAULT 1
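/*
 * Parse --mmap-flush: accepts a plain byte count or a B/K/M/G suffixed value,
 * capped at a quarter of the mmap buffer size.
 */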
564 static int record__mmap_flush_parse(const struct option *opt,
565 				    const char *str,
566 				    int unset)
567 {
568 	int flush_max;
569 	struct record_opts *opts = (struct record_opts *)opt->value;
570 	static struct parse_tag tags[] = {
571 			{ .tag  = 'B', .mult = 1       },
572 			{ .tag  = 'K', .mult = 1 << 10 },
573 			{ .tag  = 'M', .mult = 1 << 20 },
574 			{ .tag  = 'G', .mult = 1 << 30 },
575 			{ .tag  = 0 },
576 	};
577 
578 	if (unset)
579 		return 0;
580 
581 	if (str) {
582 		opts->mmap_flush = parse_tag_value(str, tags);
583 		if (opts->mmap_flush == (int)-1)
584 			opts->mmap_flush = strtol(str, NULL, 0);
585 	}
586 
587 	if (!opts->mmap_flush)
588 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
589 
590 	flush_max = evlist__mmap_size(opts->mmap_pages);
591 	flush_max /= 4;
592 	if (opts->mmap_flush > flush_max)
593 		opts->mmap_flush = flush_max;
594 
595 	return 0;
596 }
597 
598 #ifdef HAVE_ZSTD_SUPPORT
599 static unsigned int comp_level_default = 1;
600 
601 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
602 {
603 	struct record_opts *opts = opt->value;
604 
605 	if (unset) {
606 		opts->comp_level = 0;
607 	} else {
608 		if (str)
609 			opts->comp_level = strtol(str, NULL, 0);
610 		if (!opts->comp_level)
611 			opts->comp_level = comp_level_default;
612 	}
613 
614 	return 0;
615 }
616 #endif
617 static unsigned int comp_level_max = 22;
618 
619 static int record__comp_enabled(struct record *rec)
620 {
621 	return rec->opts.comp_level > 0;
622 }
623 
624 static int process_synthesized_event(const struct perf_tool *tool,
625 				     union perf_event *event,
626 				     struct perf_sample *sample __maybe_unused,
627 				     struct machine *machine __maybe_unused)
628 {
629 	struct record *rec = container_of(tool, struct record, tool);
630 	return record__write(rec, NULL, event, event->header.size);
631 }
632 
633 static struct mutex synth_lock;
634 
635 static int process_locked_synthesized_event(const struct perf_tool *tool,
636 				     union perf_event *event,
637 				     struct perf_sample *sample __maybe_unused,
638 				     struct machine *machine __maybe_unused)
639 {
640 	int ret;
641 
642 	mutex_lock(&synth_lock);
643 	ret = process_synthesized_event(tool, event, sample, machine);
644 	mutex_unlock(&synth_lock);
645 	return ret;
646 }
647 
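/*
 * perf_mmap__push() callback for the non-AIO path: optionally compress the
 * chunk into a PERF_RECORD_COMPRESSED2 record, then append it to the output.
 */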
648 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
649 {
650 	struct record *rec = to;
651 
652 	if (record__comp_enabled(rec)) {
653 		struct perf_record_compressed2 *event = map->data;
654 		size_t padding = 0;
655 		u8 pad[8] = {0};
656 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
657 						   mmap__mmap_len(map), bf, size);
658 
659 		if (compressed < 0)
660 			return (int)compressed;
661 
662 		bf = event;
663 		thread->samples++;
664 
665 		/*
666 		 * The record produced by zstd_compress() is not 8-byte aligned,
667 		 * which would cause an ASan error. Make it aligned here.
668 		 */
669 		event->data_size = compressed - sizeof(struct perf_record_compressed2);
670 		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
671 		padding = event->header.size - compressed;
672 		return record__write(rec, map, bf, compressed) ||
673 		       record__write(rec, map, &pad, padding);
674 	}
675 
676 	thread->samples++;
677 	return record__write(rec, map, bf, size);
678 }
679 
680 static volatile sig_atomic_t signr = -1;
681 static volatile sig_atomic_t child_finished;
682 #ifdef HAVE_EVENTFD_SUPPORT
683 static volatile sig_atomic_t done_fd = -1;
684 #endif
685 
686 static void sig_handler(int sig)
687 {
688 	if (sig == SIGCHLD)
689 		child_finished = 1;
690 	else
691 		signr = sig;
692 
693 	done = 1;
694 #ifdef HAVE_EVENTFD_SUPPORT
695 	if (done_fd >= 0) {
696 		u64 tmp = 1;
697 		int orig_errno = errno;
698 
699 		/*
700 		 * It is possible for this signal handler to run after done is
701 		 * checked in the main loop, but before the perf counter fds are
702 		 * polled. If this happens, the poll() will continue to wait
703 		 * even though done is set, and will only break out if either
704 		 * another signal is received, or the counters are ready for
705 		 * read. To ensure the poll() doesn't sleep when done is set,
706 		 * use an eventfd (done_fd) to wake up the poll().
707 		 */
708 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
709 			pr_err("failed to signal wakeup fd, error: %m\n");
710 
711 		errno = orig_errno;
712 	}
713 #endif // HAVE_EVENTFD_SUPPORT
714 }
715 
716 static void sigsegv_handler(int sig)
717 {
718 	perf_hooks__recover();
719 	sighandler_dump_stack(sig);
720 }
721 
722 static void record__sig_exit(void)
723 {
724 	if (signr == -1)
725 		return;
726 
727 	signal(signr, SIG_DFL);
728 	raise(signr);
729 }
730 
731 #ifdef HAVE_AUXTRACE_SUPPORT
732 
733 static int record__process_auxtrace(const struct perf_tool *tool,
734 				    struct mmap *map,
735 				    union perf_event *event, void *data1,
736 				    size_t len1, void *data2, size_t len2)
737 {
738 	struct record *rec = container_of(tool, struct record, tool);
739 	struct perf_data *data = &rec->data;
740 	size_t padding;
741 	u8 pad[8] = {0};
742 
743 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744 		off_t file_offset;
745 		int fd = perf_data__fd(data);
746 		int err;
747 
748 		file_offset = lseek(fd, 0, SEEK_CUR);
749 		if (file_offset == -1)
750 			return -1;
751 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752 						     event, file_offset);
753 		if (err)
754 			return err;
755 	}
756 
757 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758 	padding = (len1 + len2) & 7;
759 	if (padding)
760 		padding = 8 - padding;
761 
762 	record__write(rec, map, event, event->header.size);
763 	record__write(rec, map, data1, len1);
764 	if (len2)
765 		record__write(rec, map, data2, len2);
766 	record__write(rec, map, &pad, padding);
767 
768 	return 0;
769 }
770 
771 static int record__auxtrace_mmap_read(struct record *rec,
772 				      struct mmap *map)
773 {
774 	int ret;
775 
776 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
777 				  record__process_auxtrace);
778 	if (ret < 0)
779 		return ret;
780 
781 	if (ret)
782 		rec->samples++;
783 
784 	return 0;
785 }
786 
787 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
788 					       struct mmap *map)
789 {
790 	int ret;
791 
792 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
793 					   record__process_auxtrace,
794 					   rec->opts.auxtrace_snapshot_size);
795 	if (ret < 0)
796 		return ret;
797 
798 	if (ret)
799 		rec->samples++;
800 
801 	return 0;
802 }
803 
804 static int record__auxtrace_read_snapshot_all(struct record *rec)
805 {
806 	int i;
807 	int rc = 0;
808 
809 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
810 		struct mmap *map = &rec->evlist->mmap[i];
811 
812 		if (!map->auxtrace_mmap.base)
813 			continue;
814 
815 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
816 			rc = -1;
817 			goto out;
818 		}
819 	}
820 out:
821 	return rc;
822 }
823 
824 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
825 {
826 	pr_debug("Recording AUX area tracing snapshot\n");
827 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
828 		trigger_error(&auxtrace_snapshot_trigger);
829 	} else {
830 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
831 			trigger_error(&auxtrace_snapshot_trigger);
832 		else
833 			trigger_ready(&auxtrace_snapshot_trigger);
834 	}
835 }
836 
837 static int record__auxtrace_snapshot_exit(struct record *rec)
838 {
839 	if (trigger_is_error(&auxtrace_snapshot_trigger))
840 		return 0;
841 
842 	if (!auxtrace_record__snapshot_started &&
843 	    auxtrace_record__snapshot_start(rec->itr))
844 		return -1;
845 
846 	record__read_auxtrace_snapshot(rec, true);
847 	if (trigger_is_error(&auxtrace_snapshot_trigger))
848 		return -1;
849 
850 	return 0;
851 }
852 
853 static int record__auxtrace_init(struct record *rec)
854 {
855 	int err;
856 
857 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
858 	    && record__threads_enabled(rec)) {
859 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
860 		return -EINVAL;
861 	}
862 
863 	if (!rec->itr) {
864 		rec->itr = auxtrace_record__init(rec->evlist, &err);
865 		if (err)
866 			return err;
867 	}
868 
869 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
870 					      rec->opts.auxtrace_snapshot_opts);
871 	if (err)
872 		return err;
873 
874 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
875 					    rec->opts.auxtrace_sample_opts);
876 	if (err)
877 		return err;
878 
879 	err = auxtrace_parse_aux_action(rec->evlist);
880 	if (err)
881 		return err;
882 
883 	return auxtrace_parse_filters(rec->evlist);
884 }
885 
886 #else
887 
888 static inline
889 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
890 			       struct mmap *map __maybe_unused)
891 {
892 	return 0;
893 }
894 
895 static inline
896 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
897 				    bool on_exit __maybe_unused)
898 {
899 }
900 
901 static inline
902 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
903 {
904 	return 0;
905 }
906 
907 static inline
908 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
909 {
910 	return 0;
911 }
912 
913 static int record__auxtrace_init(struct record *rec __maybe_unused)
914 {
915 	return 0;
916 }
917 
918 #endif
919 
920 static int record__config_text_poke(struct evlist *evlist)
921 {
922 	struct evsel *evsel;
923 
924 	/* Nothing to do if text poke is already configured */
925 	evlist__for_each_entry(evlist, evsel) {
926 		if (evsel->core.attr.text_poke)
927 			return 0;
928 	}
929 
930 	evsel = evlist__add_dummy_on_all_cpus(evlist);
931 	if (!evsel)
932 		return -ENOMEM;
933 
934 	evsel->core.attr.text_poke = 1;
935 	evsel->core.attr.ksymbol = 1;
936 	evsel->immediate = true;
937 	evsel__set_sample_bit(evsel, TIME);
938 
939 	return 0;
940 }
941 
942 static int record__config_off_cpu(struct record *rec)
943 {
944 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
945 }
946 
947 static bool record__tracking_system_wide(struct record *rec)
948 {
949 	struct evlist *evlist = rec->evlist;
950 	struct evsel *evsel;
951 
952 	/*
953 	 * If a non-dummy evsel exists, system_wide sideband is needed to
954 	 * help parse sample information; for example, PERF_RECORD_MMAP
955 	 * events to help resolve symbols, and PERF_RECORD_COMM events to
956 	 * help parse the task executable name.
957 	 */
958 	evlist__for_each_entry(evlist, evsel) {
959 		if (!evsel__is_dummy_event(evsel))
960 			return true;
961 	}
962 
963 	return false;
964 }
965 
966 static int record__config_tracking_events(struct record *rec)
967 {
968 	struct record_opts *opts = &rec->opts;
969 	struct evlist *evlist = rec->evlist;
970 	bool system_wide = false;
971 	struct evsel *evsel;
972 
973 	/*
974 	 * For initial_delay, system wide, or a hybrid system, we need to add a
975 	 * tracking event so that we can track PERF_RECORD_MMAP events to cover
976 	 * the delay of waiting or of event synthesis.
977 	 */
978 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
979 	    perf_pmus__num_core_pmus() > 1) {
980 
981 		/*
982 		 * User space tasks can migrate between CPUs, so when tracing
983 		 * selected CPUs, sideband for all CPUs is still needed.
984 		 */
985 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
986 			system_wide = true;
987 
988 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
989 		if (!evsel)
990 			return -ENOMEM;
991 
992 		/*
993 		 * Enable the tracking event when the process is forked for
994 		 * initial_delay, immediately for system wide.
995 		 */
996 		if (opts->target.initial_delay && !evsel->immediate &&
997 		    !target__has_cpu(&opts->target))
998 			evsel->core.attr.enable_on_exec = 1;
999 		else
1000 			evsel->immediate = 1;
1001 	}
1002 
1003 	return 0;
1004 }
1005 
1006 static bool record__kcore_readable(struct machine *machine)
1007 {
1008 	char kcore[PATH_MAX];
1009 	int fd;
1010 
1011 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
1012 
1013 	fd = open(kcore, O_RDONLY);
1014 	if (fd < 0)
1015 		return false;
1016 
1017 	close(fd);
1018 
1019 	return true;
1020 }
1021 
1022 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1023 {
1024 	char from_dir[PATH_MAX];
1025 	char kcore_dir[PATH_MAX];
1026 	int ret;
1027 
1028 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1029 
1030 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1031 	if (ret)
1032 		return ret;
1033 
1034 	return kcore_copy(from_dir, kcore_dir);
1035 }
1036 
1037 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1038 {
1039 	thread_data->pipes.msg[0] = -1;
1040 	thread_data->pipes.msg[1] = -1;
1041 	thread_data->pipes.ack[0] = -1;
1042 	thread_data->pipes.ack[1] = -1;
1043 }
1044 
1045 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1046 {
1047 	if (pipe(thread_data->pipes.msg))
1048 		return -EINVAL;
1049 
1050 	if (pipe(thread_data->pipes.ack)) {
1051 		close(thread_data->pipes.msg[0]);
1052 		thread_data->pipes.msg[0] = -1;
1053 		close(thread_data->pipes.msg[1]);
1054 		thread_data->pipes.msg[1] = -1;
1055 		return -EINVAL;
1056 	}
1057 
1058 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1059 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1060 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1061 
1062 	return 0;
1063 }
1064 
1065 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1066 {
1067 	if (thread_data->pipes.msg[0] != -1) {
1068 		close(thread_data->pipes.msg[0]);
1069 		thread_data->pipes.msg[0] = -1;
1070 	}
1071 	if (thread_data->pipes.msg[1] != -1) {
1072 		close(thread_data->pipes.msg[1]);
1073 		thread_data->pipes.msg[1] = -1;
1074 	}
1075 	if (thread_data->pipes.ack[0] != -1) {
1076 		close(thread_data->pipes.ack[0]);
1077 		thread_data->pipes.ack[0] = -1;
1078 	}
1079 	if (thread_data->pipes.ack[1] != -1) {
1080 		close(thread_data->pipes.ack[1]);
1081 		thread_data->pipes.ack[1] = -1;
1082 	}
1083 }
1084 
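/* True when no CPUs were requested, i.e. mmaps are per-thread, not per-CPU. */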
1085 static bool evlist__per_thread(struct evlist *evlist)
1086 {
1087 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1088 }
1089 
1090 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1091 {
1092 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1093 	struct mmap *mmap = evlist->mmap;
1094 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1095 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1096 	bool per_thread = evlist__per_thread(evlist);
1097 
1098 	if (per_thread)
1099 		thread_data->nr_mmaps = nr_mmaps;
1100 	else
1101 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1102 						      thread_data->mask->maps.nbits);
1103 	if (mmap) {
1104 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1105 		if (!thread_data->maps)
1106 			return -ENOMEM;
1107 	}
1108 	if (overwrite_mmap) {
1109 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1110 		if (!thread_data->overwrite_maps) {
1111 			zfree(&thread_data->maps);
1112 			return -ENOMEM;
1113 		}
1114 	}
1115 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1116 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1117 
1118 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1119 		if (per_thread ||
1120 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1121 			if (thread_data->maps) {
1122 				thread_data->maps[tm] = &mmap[m];
1123 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1124 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1125 			}
1126 			if (thread_data->overwrite_maps) {
1127 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1128 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1129 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1130 			}
1131 			tm++;
1132 		}
1133 	}
1134 
1135 	return 0;
1136 }
1137 
1138 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1139 {
1140 	int f, tm, pos;
1141 	struct mmap *map, *overwrite_map;
1142 
1143 	fdarray__init(&thread_data->pollfd, 64);
1144 
1145 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1146 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1147 		overwrite_map = thread_data->overwrite_maps ?
1148 				thread_data->overwrite_maps[tm] : NULL;
1149 
1150 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1151 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1152 
1153 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1154 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1155 							      &evlist->core.pollfd);
1156 				if (pos < 0)
1157 					return pos;
1158 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1159 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1160 			}
1161 		}
1162 	}
1163 
1164 	return 0;
1165 }
1166 
1167 static void record__free_thread_data(struct record *rec)
1168 {
1169 	int t;
1170 	struct record_thread *thread_data = rec->thread_data;
1171 
1172 	if (thread_data == NULL)
1173 		return;
1174 
1175 	for (t = 0; t < rec->nr_threads; t++) {
1176 		record__thread_data_close_pipes(&thread_data[t]);
1177 		zfree(&thread_data[t].maps);
1178 		zfree(&thread_data[t].overwrite_maps);
1179 		fdarray__exit(&thread_data[t].pollfd);
1180 	}
1181 
1182 	zfree(&rec->thread_data);
1183 }
1184 
1185 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1186 						    int evlist_pollfd_index,
1187 						    int thread_pollfd_index)
1188 {
1189 	size_t x = rec->index_map_cnt;
1190 
1191 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1192 		return -ENOMEM;
1193 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1194 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1195 	rec->index_map_cnt += 1;
1196 	return 0;
1197 }
1198 
1199 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1200 						    struct evlist *evlist,
1201 						    struct record_thread *thread_data)
1202 {
1203 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1204 	struct pollfd *t_entries = thread_data->pollfd.entries;
1205 	int err = 0;
1206 	size_t i;
1207 
1208 	for (i = 0; i < rec->index_map_cnt; i++) {
1209 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1210 		int t_pos = rec->index_map[i].thread_pollfd_index;
1211 
1212 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1213 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1214 			pr_err("Thread and evlist pollfd index mismatch\n");
1215 			err = -EINVAL;
1216 			continue;
1217 		}
1218 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1219 	}
1220 	return err;
1221 }
1222 
1223 static int record__dup_non_perf_events(struct record *rec,
1224 				       struct evlist *evlist,
1225 				       struct record_thread *thread_data)
1226 {
1227 	struct fdarray *fda = &evlist->core.pollfd;
1228 	int i, ret;
1229 
1230 	for (i = 0; i < fda->nr; i++) {
1231 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1232 			continue;
1233 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1234 		if (ret < 0) {
1235 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1236 			return ret;
1237 		}
1238 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1239 			  thread_data, ret, fda->entries[i].fd);
1240 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1241 		if (ret < 0) {
1242 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1243 			return ret;
1244 		}
1245 	}
1246 	return 0;
1247 }
1248 
1249 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1250 {
1251 	int t, ret;
1252 	struct record_thread *thread_data;
1253 
1254 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1255 	if (!rec->thread_data) {
1256 		pr_err("Failed to allocate thread data\n");
1257 		return -ENOMEM;
1258 	}
1259 	thread_data = rec->thread_data;
1260 
1261 	for (t = 0; t < rec->nr_threads; t++)
1262 		record__thread_data_init_pipes(&thread_data[t]);
1263 
1264 	for (t = 0; t < rec->nr_threads; t++) {
1265 		thread_data[t].rec = rec;
1266 		thread_data[t].mask = &rec->thread_masks[t];
1267 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1268 		if (ret) {
1269 			pr_err("Failed to initialize thread[%d] maps\n", t);
1270 			goto out_free;
1271 		}
1272 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1273 		if (ret) {
1274 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1275 			goto out_free;
1276 		}
1277 		if (t) {
1278 			thread_data[t].tid = -1;
1279 			ret = record__thread_data_open_pipes(&thread_data[t]);
1280 			if (ret) {
1281 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1282 				goto out_free;
1283 			}
1284 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1285 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1286 			if (ret < 0) {
1287 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1288 				goto out_free;
1289 			}
1290 			thread_data[t].ctlfd_pos = ret;
1291 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1292 				 thread_data, thread_data[t].ctlfd_pos,
1293 				 thread_data[t].pipes.msg[0]);
1294 		} else {
1295 			thread_data[t].tid = gettid();
1296 
1297 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1298 			if (ret < 0)
1299 				goto out_free;
1300 
1301 			thread_data[t].ctlfd_pos = -1; /* Not used */
1302 		}
1303 	}
1304 
1305 	return 0;
1306 
1307 out_free:
1308 	record__free_thread_data(rec);
1309 
1310 	return ret;
1311 }
1312 
1313 static int record__mmap_evlist(struct record *rec,
1314 			       struct evlist *evlist)
1315 {
1316 	int i, ret;
1317 	struct record_opts *opts = &rec->opts;
1318 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1319 				  opts->auxtrace_sample_mode;
1320 	char msg[512];
1321 
1322 	if (opts->affinity != PERF_AFFINITY_SYS)
1323 		cpu__setup_cpunode_map();
1324 
1325 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1326 				 opts->auxtrace_mmap_pages,
1327 				 auxtrace_overwrite,
1328 				 opts->nr_cblocks, opts->affinity,
1329 				 opts->mmap_flush, opts->comp_level) < 0) {
1330 		if (errno == EPERM) {
1331 			pr_err("Permission error mapping pages.\n"
1332 			       "Consider increasing "
1333 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1334 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1335 			       "(current value: %u,%u)\n",
1336 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1337 			return -errno;
1338 		} else {
1339 			pr_err("failed to mmap with %d (%s)\n", errno,
1340 				str_error_r(errno, msg, sizeof(msg)));
1341 			if (errno)
1342 				return -errno;
1343 			else
1344 				return -EINVAL;
1345 		}
1346 	}
1347 
1348 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1349 		return -1;
1350 
1351 	ret = record__alloc_thread_data(rec, evlist);
1352 	if (ret)
1353 		return ret;
1354 
1355 	if (record__threads_enabled(rec)) {
1356 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1357 		if (ret) {
1358 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1359 			return ret;
1360 		}
1361 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1362 			if (evlist->mmap)
1363 				evlist->mmap[i].file = &rec->data.dir.files[i];
1364 			if (evlist->overwrite_mmap)
1365 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1366 		}
1367 	}
1368 
1369 	return 0;
1370 }
1371 
1372 static int record__mmap(struct record *rec)
1373 {
1374 	return record__mmap_evlist(rec, rec->evlist);
1375 }
1376 
1377 static int record__open(struct record *rec)
1378 {
1379 	char msg[BUFSIZ];
1380 	struct evsel *pos;
1381 	struct evlist *evlist = rec->evlist;
1382 	struct perf_session *session = rec->session;
1383 	struct record_opts *opts = &rec->opts;
1384 	int rc = 0;
1385 
1386 	evlist__for_each_entry(evlist, pos) {
1387 try_again:
1388 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1389 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1390 				if (verbose > 0)
1391 					ui__warning("%s\n", msg);
1392 				goto try_again;
1393 			}
1394 			if ((errno == EINVAL || errno == EBADF) &&
1395 			    pos->core.leader != &pos->core &&
1396 			    pos->weak_group) {
1397 				pos = evlist__reset_weak_group(evlist, pos, true);
1398 				goto try_again;
1399 			}
1400 			rc = -errno;
1401 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1402 			ui__error("%s\n", msg);
1403 			goto out;
1404 		}
1405 
1406 		pos->supported = true;
1407 	}
1408 
1409 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1410 		pr_warning(
1411 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1412 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1413 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1414 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1415 "Samples in kernel modules won't be resolved at all.\n\n"
1416 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1417 "even with a suitable vmlinux or kallsyms file.\n\n");
1418 	}
1419 
1420 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1421 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1422 			pos->filter ?: "BPF", evsel__name(pos), errno,
1423 			str_error_r(errno, msg, sizeof(msg)));
1424 		rc = -1;
1425 		goto out;
1426 	}
1427 
1428 	rc = record__mmap(rec);
1429 	if (rc)
1430 		goto out;
1431 
1432 	session->evlist = evlist;
1433 	perf_session__set_id_hdr_size(session);
1434 out:
1435 	return rc;
1436 }
1437 
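/* Remember the timestamps of the first and the most recent processed sample. */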
1438 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1439 {
1440 	if (rec->evlist->first_sample_time == 0)
1441 		rec->evlist->first_sample_time = sample_time;
1442 
1443 	if (sample_time)
1444 		rec->evlist->last_sample_time = sample_time;
1445 }
1446 
1447 static int process_sample_event(const struct perf_tool *tool,
1448 				union perf_event *event,
1449 				struct perf_sample *sample,
1450 				struct evsel *evsel,
1451 				struct machine *machine)
1452 {
1453 	struct record *rec = container_of(tool, struct record, tool);
1454 
1455 	set_timestamp_boundary(rec, sample->time);
1456 
1457 	if (rec->buildid_all)
1458 		return 0;
1459 
1460 	rec->samples++;
1461 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1462 }
1463 
1464 static int process_buildids(struct record *rec)
1465 {
1466 	struct perf_session *session = rec->session;
1467 
1468 	if (perf_data__size(&rec->data) == 0)
1469 		return 0;
1470 
1471 	/*
1472 	 * During this process, it'll load the kernel map and replace
1473 	 * dso->long_name with a real pathname it found.  In this case
1474 	 * we prefer the vmlinux path like
1475 	 *   /lib/modules/3.16.4/build/vmlinux
1476 	 *
1477 	 * rather than build-id path (in debug directory).
1478 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1479 	 */
1480 	symbol_conf.ignore_vmlinux_buildid = true;
1481 
1482 	/*
1483 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1484 	 * so there is no need to process samples. But if timestamp_boundary is
1485 	 * enabled, it still needs to walk all samples to get the timestamps of
1486 	 * the first/last samples.
1487 	 */
1488 	if (rec->buildid_all && !rec->timestamp_boundary)
1489 		rec->tool.sample = process_event_sample_stub;
1490 
1491 	return perf_session__process_events(session);
1492 }
1493 
1494 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1495 {
1496 	int err;
1497 	struct perf_tool *tool = data;
1498 	/*
1499 	 * As for the guest kernel, when processing the record & report
1500 	 * subcommands we arrange module mmaps prior to the guest kernel mmap
1501 	 * and trigger a dso preload, because by default guest module symbols
1502 	 * are loaded from guest kallsyms instead of /lib/modules/XXX/XXX.
1503 	 * This method avoids missing symbols when the first address is in a
1504 	 * module instead of in the guest kernel.
1505 	 */
1506 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1507 					     machine);
1508 	if (err < 0)
1509 		pr_err("Couldn't record guest kernel [%d]'s reference"
1510 		       " relocation symbol.\n", machine->pid);
1511 
1512 	/*
1513 	 * We use _stext for the guest kernel because the guest kernel's
1514 	 * /proc/kallsyms sometimes has no _text.
1515 	 */
1516 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1517 						 machine);
1518 	if (err < 0)
1519 		pr_err("Couldn't record guest kernel [%d]'s reference"
1520 		       " relocation symbol.\n", machine->pid);
1521 }
1522 
1523 static struct perf_event_header finished_round_event = {
1524 	.size = sizeof(struct perf_event_header),
1525 	.type = PERF_RECORD_FINISHED_ROUND,
1526 };
1527 
1528 static struct perf_event_header finished_init_event = {
1529 	.size = sizeof(struct perf_event_header),
1530 	.type = PERF_RECORD_FINISHED_INIT,
1531 };
1532 
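/*
 * With --affinity=node|cpu, migrate this thread onto the CPUs backing the
 * mmap being flushed so the buffer is read locally.
 */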
1533 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1534 {
1535 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1536 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1537 			  thread->mask->affinity.nbits)) {
1538 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1539 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1540 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1541 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1542 					(cpu_set_t *)thread->mask->affinity.bits);
1543 		if (verbose == 2) {
1544 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1545 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1546 		}
1547 	}
1548 }
1549 
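/*
 * Callback for zstd_compress_stream_to_records(): lay down a
 * PERF_RECORD_COMPRESSED2 header before each compressed chunk and grow its
 * size as more compressed payload is appended.
 */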
1550 static size_t process_comp_header(void *record, size_t increment)
1551 {
1552 	struct perf_record_compressed2 *event = record;
1553 	size_t size = sizeof(*event);
1554 
1555 	if (increment) {
1556 		event->header.size += increment;
1557 		return increment;
1558 	}
1559 
1560 	event->header.type = PERF_RECORD_COMPRESSED2;
1561 	event->header.size = size;
1562 
1563 	return size;
1564 }
1565 
1566 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1567 			    void *dst, size_t dst_size, void *src, size_t src_size)
1568 {
1569 	ssize_t compressed;
1570 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1571 	struct zstd_data *zstd_data = &session->zstd_data;
1572 
1573 	if (map && map->file)
1574 		zstd_data = &map->zstd_data;
1575 
1576 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1577 						     max_record_size, process_comp_header);
1578 	if (compressed < 0)
1579 		return compressed;
1580 
1581 	if (map && map->file) {
1582 		thread->bytes_transferred += src_size;
1583 		thread->bytes_compressed  += compressed;
1584 	} else {
1585 		session->bytes_transferred += src_size;
1586 		session->bytes_compressed  += compressed;
1587 	}
1588 
1589 	return compressed;
1590 }
1591 
1592 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1593 				    bool overwrite, bool synch)
1594 {
1595 	u64 bytes_written = rec->bytes_written;
1596 	int i;
1597 	int rc = 0;
1598 	int nr_mmaps;
1599 	struct mmap **maps;
1600 	int trace_fd = rec->data.file.fd;
1601 	off_t off = 0;
1602 
1603 	if (!evlist)
1604 		return 0;
1605 
1606 	nr_mmaps = thread->nr_mmaps;
1607 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1608 
1609 	if (!maps)
1610 		return 0;
1611 
1612 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1613 		return 0;
1614 
1615 	if (record__aio_enabled(rec))
1616 		off = record__aio_get_pos(trace_fd);
1617 
1618 	for (i = 0; i < nr_mmaps; i++) {
1619 		u64 flush = 0;
1620 		struct mmap *map = maps[i];
1621 
1622 		if (map->core.base) {
1623 			record__adjust_affinity(rec, map);
1624 			if (synch) {
1625 				flush = map->core.flush;
1626 				map->core.flush = 1;
1627 			}
1628 			if (!record__aio_enabled(rec)) {
1629 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1630 					if (synch)
1631 						map->core.flush = flush;
1632 					rc = -1;
1633 					goto out;
1634 				}
1635 			} else {
1636 				if (record__aio_push(rec, map, &off) < 0) {
1637 					record__aio_set_pos(trace_fd, off);
1638 					if (synch)
1639 						map->core.flush = flush;
1640 					rc = -1;
1641 					goto out;
1642 				}
1643 			}
1644 			if (synch)
1645 				map->core.flush = flush;
1646 		}
1647 
1648 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1649 		    !rec->opts.auxtrace_sample_mode &&
1650 		    record__auxtrace_mmap_read(rec, map) != 0) {
1651 			rc = -1;
1652 			goto out;
1653 		}
1654 	}
1655 
1656 	if (record__aio_enabled(rec))
1657 		record__aio_set_pos(trace_fd, off);
1658 
1659 	/*
1660 	 * Mark the round finished in case we wrote
1661 	 * at least one event.
1662 	 *
1663 	 * No need for round events in directory mode,
1664 	 * because per-cpu maps and files have data
1665 	 * sorted by the kernel.
1666 	 */
1667 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1668 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1669 
1670 	if (overwrite)
1671 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1672 out:
1673 	return rc;
1674 }
1675 
1676 static int record__mmap_read_all(struct record *rec, bool synch)
1677 {
1678 	int err;
1679 
1680 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1681 	if (err)
1682 		return err;
1683 
1684 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1685 }
1686 
1687 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1688 					   void *arg __maybe_unused)
1689 {
1690 	struct perf_mmap *map = fda->priv[fd].ptr;
1691 
1692 	if (map)
1693 		perf_mmap__put(map);
1694 }
1695 
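/*
 * Main loop of a streaming thread in --threads mode: drain the assigned
 * mmaps, poll for new data, and terminate once the main thread closes the
 * message pipe (seen as POLLHUP on the control fd).
 */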
1696 static void *record__thread(void *arg)
1697 {
1698 	enum thread_msg msg = THREAD_MSG__READY;
1699 	bool terminate = false;
1700 	struct fdarray *pollfd;
1701 	int err, ctlfd_pos;
1702 
1703 	thread = arg;
1704 	thread->tid = gettid();
1705 
1706 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1707 	if (err == -1)
1708 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1709 			   thread->tid, strerror(errno));
1710 
1711 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1712 
1713 	pollfd = &thread->pollfd;
1714 	ctlfd_pos = thread->ctlfd_pos;
1715 
1716 	for (;;) {
1717 		unsigned long long hits = thread->samples;
1718 
1719 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1720 			break;
1721 
1722 		if (hits == thread->samples) {
1723 
1724 			err = fdarray__poll(pollfd, -1);
1725 			/*
1726 			 * Propagate the error only if there is one. Ignore a positive
1727 			 * number of returned events and interrupt errors.
1728 			 */
1729 			if (err > 0 || (err < 0 && errno == EINTR))
1730 				err = 0;
1731 			thread->waking++;
1732 
1733 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1734 					    record__thread_munmap_filtered, NULL) == 0)
1735 				break;
1736 		}
1737 
1738 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1739 			terminate = true;
1740 			close(thread->pipes.msg[0]);
1741 			thread->pipes.msg[0] = -1;
1742 			pollfd->entries[ctlfd_pos].fd = -1;
1743 			pollfd->entries[ctlfd_pos].events = 0;
1744 		}
1745 
1746 		pollfd->entries[ctlfd_pos].revents = 0;
1747 	}
1748 	record__mmap_read_all(thread->rec, true);
1749 
1750 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1751 	if (err == -1)
1752 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1753 			   thread->tid, strerror(errno));
1754 
1755 	return NULL;
1756 }
1757 
1758 static void record__init_features(struct record *rec)
1759 {
1760 	struct perf_session *session = rec->session;
1761 	int feat;
1762 
1763 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1764 		perf_header__set_feat(&session->header, feat);
1765 
1766 	if (rec->no_buildid)
1767 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1768 
1769 	if (!have_tracepoints(&rec->evlist->core.entries))
1770 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1771 
1772 	if (!rec->opts.branch_stack)
1773 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1774 
1775 	if (!rec->opts.full_auxtrace)
1776 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1777 
1778 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1779 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1780 
1781 	if (!rec->opts.use_clockid)
1782 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1783 
1784 	if (!record__threads_enabled(rec))
1785 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1786 
1787 	if (!record__comp_enabled(rec))
1788 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1789 
1790 	perf_header__clear_feat(&session->header, HEADER_STAT);
1791 }
1792 
1793 static void
1794 record__finish_output(struct record *rec)
1795 {
1796 	int i;
1797 	struct perf_data *data = &rec->data;
1798 	int fd = perf_data__fd(data);
1799 
1800 	if (data->is_pipe) {
1801 		/* Just to display approx. size */
1802 		data->file.size = rec->bytes_written;
1803 		return;
1804 	}
1805 
1806 	rec->session->header.data_size += rec->bytes_written;
1807 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1808 	if (record__threads_enabled(rec)) {
1809 		for (i = 0; i < data->dir.nr; i++)
1810 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1811 	}
1812 
1813 	if (!rec->no_buildid) {
1814 		process_buildids(rec);
1815 
1816 		if (rec->buildid_all)
1817 			perf_session__dsos_hit_all(rec->session);
1818 	}
1819 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1820 
1821 	return;
1822 }
1823 
1824 static int record__synthesize_workload(struct record *rec, bool tail)
1825 {
1826 	int err;
1827 	struct perf_thread_map *thread_map;
1828 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1829 
1830 	if (rec->opts.tail_synthesize != tail)
1831 		return 0;
1832 
1833 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1834 	if (thread_map == NULL)
1835 		return -1;
1836 
1837 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1838 						 process_synthesized_event,
1839 						 &rec->session->machines.host,
1840 						 needs_mmap,
1841 						 rec->opts.sample_address);
1842 	perf_thread_map__put(thread_map);
1843 	return err;
1844 }
1845 
1846 static int write_finished_init(struct record *rec, bool tail)
1847 {
1848 	if (rec->opts.tail_synthesize != tail)
1849 		return 0;
1850 
1851 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1852 }
1853 
1854 static int record__synthesize(struct record *rec, bool tail);
1855 
1856 static int
1857 record__switch_output(struct record *rec, bool at_exit)
1858 {
1859 	struct perf_data *data = &rec->data;
1860 	char *new_filename = NULL;
1861 	int fd, err;
1862 
1863 	/* Same size as "2015122520103046" */
1864 	char timestamp[] = "InvalidTimestamp";
1865 
1866 	record__aio_mmap_read_sync(rec);
1867 
1868 	write_finished_init(rec, true);
1869 
1870 	record__synthesize(rec, true);
1871 	if (target__none(&rec->opts.target))
1872 		record__synthesize_workload(rec, true);
1873 
1874 	rec->samples = 0;
1875 	record__finish_output(rec);
1876 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1877 	if (err) {
1878 		pr_err("Failed to get current timestamp\n");
1879 		return -EINVAL;
1880 	}
1881 
1882 	fd = perf_data__switch(data, timestamp,
1883 			       rec->session->header.data_offset,
1884 			       at_exit, &new_filename);
1885 	if (fd >= 0 && !at_exit) {
1886 		rec->bytes_written = 0;
1887 		rec->session->header.data_size = 0;
1888 	}
1889 
1890 	if (!quiet) {
1891 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1892 			data->path, timestamp);
1893 	}
1894 
1895 	if (rec->switch_output.num_files) {
1896 		int n = rec->switch_output.cur_file + 1;
1897 
1898 		if (n >= rec->switch_output.num_files)
1899 			n = 0;
1900 		rec->switch_output.cur_file = n;
1901 		if (rec->switch_output.filenames[n]) {
1902 			remove(rec->switch_output.filenames[n]);
1903 			zfree(&rec->switch_output.filenames[n]);
1904 		}
1905 		rec->switch_output.filenames[n] = new_filename;
1906 	} else {
1907 		free(new_filename);
1908 	}
1909 
1910 	/* Output tracking events */
1911 	if (!at_exit) {
1912 		record__synthesize(rec, false);
1913 
1914 		/*
1915 		 * In 'perf record --switch-output' without -a,
1916 		 * record__synthesize() in record__switch_output() won't
1917 		 * generate tracking events because there's no thread_map
1918 		 * in the evlist, which causes the newly created perf.data
1919 		 * to lack map and comm information.
1920 		 * Create a fake thread_map and directly call
1921 		 * perf_event__synthesize_thread_map() for those events.
1922 		 */
1923 		if (target__none(&rec->opts.target))
1924 			record__synthesize_workload(rec, false);
1925 		write_finished_init(rec, false);
1926 	}
1927 	return fd;
1928 }
1929 
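/*
 * Emit one PERF_RECORD_LOST_SAMPLES event for the given evsel/CPU/thread,
 * appending an id sample that carries the matching sample id.
 */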
1930 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1931 					struct perf_record_lost_samples *lost,
1932 					int cpu_idx, int thread_idx, u64 lost_count,
1933 					u16 misc_flag)
1934 {
1935 	struct perf_sample_id *sid;
1936 	struct perf_sample sample;
1937 	int id_hdr_size;
1938 
1939 	perf_sample__init(&sample, /*all=*/true);
1940 	lost->lost = lost_count;
1941 	if (evsel->core.ids) {
1942 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1943 		sample.id = sid->id;
1944 	}
1945 
1946 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1947 						       evsel->core.attr.sample_type, &sample);
1948 	lost->header.size = sizeof(*lost) + id_hdr_size;
1949 	lost->header.misc = misc_flag;
1950 	record__write(rec, NULL, lost, lost->header.size);
1951 	perf_sample__exit(&sample);
1952 }
1953 
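/*
 * At the end of the session, read the kernel's lost-sample counts for every
 * event/CPU/thread and emit matching PERF_RECORD_LOST_SAMPLES events, plus
 * any samples dropped by BPF filters.
 */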
1954 static void record__read_lost_samples(struct record *rec)
1955 {
1956 	struct perf_session *session = rec->session;
1957 	struct perf_record_lost_samples_and_ids lost;
1958 	struct evsel *evsel;
1959 
1960 	/* there was an error during record__open */
1961 	if (session->evlist == NULL)
1962 		return;
1963 
1964 	evlist__for_each_entry(session->evlist, evsel) {
1965 		struct xyarray *xy = evsel->core.sample_id;
1966 		u64 lost_count;
1967 
1968 		if (xy == NULL || evsel->core.fd == NULL)
1969 			continue;
1970 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1971 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1972 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1973 			continue;
1974 		}
1975 
1976 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1977 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1978 				struct perf_counts_values count;
1979 
1980 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1981 					pr_debug("read LOST count failed\n");
1982 					return;
1983 				}
1984 
1985 				if (count.lost) {
1986 					memset(&lost, 0, sizeof(lost));
1987 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1988 					__record__save_lost_samples(rec, evsel, &lost.lost,
1989 								    x, y, count.lost, 0);
1990 				}
1991 			}
1992 		}
1993 
1994 		lost_count = perf_bpf_filter__lost_count(evsel);
1995 		if (lost_count) {
1996 			memset(&lost, 0, sizeof(lost));
1997 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1998 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
1999 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2000 		}
2001 	}
2002 }
2003 
2004 static volatile sig_atomic_t workload_exec_errno;
2005 
2006 /*
2007  * evlist__prepare_workload will send a SIGUSR1
2008  * if the fork fails, since we asked for it by setting its
2009  * want_signal to true.
2010  */
2011 static void workload_exec_failed_signal(int signo __maybe_unused,
2012 					siginfo_t *info,
2013 					void *ucontext __maybe_unused)
2014 {
2015 	workload_exec_errno = info->si_value.sival_int;
2016 	done = 1;
2017 	child_finished = 1;
2018 }
2019 
2020 static void snapshot_sig_handler(int sig);
2021 static void alarm_sig_handler(int sig);
2022 
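/* Pick any mmapped event's control page, used below for time conversion data. */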
2023 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2024 {
2025 	if (evlist) {
2026 		if (evlist->mmap && evlist->mmap[0].core.base)
2027 			return evlist->mmap[0].core.base;
2028 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2029 			return evlist->overwrite_mmap[0].core.base;
2030 	}
2031 	return NULL;
2032 }
2033 
2034 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2035 {
2036 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2037 	if (pc)
2038 		return pc;
2039 	return NULL;
2040 }
2041 
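/*
 * Synthesize the non-sample metadata events: time conversion, id index,
 * auxtrace info, kernel and module mmaps, extra attrs, thread and cpu maps,
 * BPF and cgroup events, and the existing tasks of the target.  Runs either
 * up front or, with --tail-synthesize, at the end of the session.
 */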
2042 static int record__synthesize(struct record *rec, bool tail)
2043 {
2044 	struct perf_session *session = rec->session;
2045 	struct machine *machine = &session->machines.host;
2046 	struct perf_data *data = &rec->data;
2047 	struct record_opts *opts = &rec->opts;
2048 	struct perf_tool *tool = &rec->tool;
2049 	int err = 0;
2050 	event_op f = process_synthesized_event;
2051 
2052 	if (rec->opts.tail_synthesize != tail)
2053 		return 0;
2054 
2055 	if (data->is_pipe) {
2056 		err = perf_event__synthesize_for_pipe(tool, session, data,
2057 						      process_synthesized_event);
2058 		if (err < 0)
2059 			goto out;
2060 
2061 		rec->bytes_written += err;
2062 	}
2063 
2064 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2065 					  process_synthesized_event, machine);
2066 	if (err)
2067 		goto out;
2068 
2069 	/* Synthesize id_index before auxtrace_info */
2070 	err = perf_event__synthesize_id_index(tool,
2071 					      process_synthesized_event,
2072 					      session->evlist, machine);
2073 	if (err)
2074 		goto out;
2075 
2076 	if (rec->opts.full_auxtrace) {
2077 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2078 					session, process_synthesized_event);
2079 		if (err)
2080 			goto out;
2081 	}
2082 
2083 	if (!evlist__exclude_kernel(rec->evlist)) {
2084 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2085 							 machine);
2086 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2087 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2088 				   "Check /proc/kallsyms permission or run as root.\n");
2089 
2090 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2091 						     machine);
2092 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2093 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2094 				   "Check /proc/modules permission or run as root.\n");
2095 	}
2096 
2097 	if (perf_guest) {
2098 		machines__process_guests(&session->machines,
2099 					 perf_event__synthesize_guest_os, tool);
2100 	}
2101 
2102 	err = perf_event__synthesize_extra_attr(&rec->tool,
2103 						rec->evlist,
2104 						process_synthesized_event,
2105 						data->is_pipe);
2106 	if (err)
2107 		goto out;
2108 
2109 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2110 						 process_synthesized_event,
2111 						NULL);
2112 	if (err < 0) {
2113 		pr_err("Couldn't synthesize thread map.\n");
2114 		return err;
2115 	}
2116 
2117 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2118 					     process_synthesized_event, NULL);
2119 	if (err < 0) {
2120 		pr_err("Couldn't synthesize cpu map.\n");
2121 		return err;
2122 	}
2123 
2124 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2125 						machine, opts);
2126 	if (err < 0) {
2127 		pr_warning("Couldn't synthesize bpf events.\n");
2128 		err = 0;
2129 	}
2130 
2131 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2132 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2133 						     machine);
2134 		if (err < 0) {
2135 			pr_warning("Couldn't synthesize cgroup events.\n");
2136 			err = 0;
2137 		}
2138 	}
2139 
2140 	if (rec->opts.nr_threads_synthesize > 1) {
2141 		mutex_init(&synth_lock);
2142 		perf_set_multithreaded();
2143 		f = process_locked_synthesized_event;
2144 	}
2145 
2146 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2147 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2148 
2149 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2150 						    rec->evlist->core.threads,
2151 						    f, needs_mmap, opts->sample_address,
2152 						    rec->opts.nr_threads_synthesize);
2153 	}
2154 
2155 	if (rec->opts.nr_threads_synthesize > 1) {
2156 		perf_set_singlethreaded();
2157 		mutex_destroy(&synth_lock);
2158 	}
2159 
2160 out:
2161 	return err;
2162 }
2163 
2164 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2165 {
2166 	struct record *rec = data;
2167 	pthread_kill(rec->thread_id, SIGUSR2);
2168 	return 0;
2169 }
2170 
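/*
 * Set up the side-band evlist: wire --switch-output-event up to signal the
 * main thread with SIGUSR2, optionally add the BPF tracking event, and start
 * the side-band thread.
 */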
2171 static int record__setup_sb_evlist(struct record *rec)
2172 {
2173 	struct record_opts *opts = &rec->opts;
2174 
2175 	if (rec->sb_evlist != NULL) {
2176 		/*
2177 		 * We get here if --switch-output-event populated the
2178 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2179 		 * to the main thread.
2180 		 */
2181 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2182 		rec->thread_id = pthread_self();
2183 	}
2184 #ifdef HAVE_LIBBPF_SUPPORT
2185 	if (!opts->no_bpf_event) {
2186 		if (rec->sb_evlist == NULL) {
2187 			rec->sb_evlist = evlist__new();
2188 
2189 			if (rec->sb_evlist == NULL) {
2190 				pr_err("Couldn't create side band evlist.\n");
2191 				return -1;
2192 			}
2193 		}
2194 
2195 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2196 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2197 			return -1;
2198 		}
2199 	}
2200 #endif
2201 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2202 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2203 		opts->no_bpf_event = true;
2204 	}
2205 
2206 	return 0;
2207 }
2208 
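/*
 * For --clockid, store the clock id plus a reference gettimeofday()/
 * clock_gettime() pair in the header env so the sampling clock can later be
 * related to wall-clock time.
 */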
2209 static int record__init_clock(struct record *rec)
2210 {
2211 	struct perf_session *session = rec->session;
2212 	struct timespec ref_clockid;
2213 	struct timeval ref_tod;
2214 	u64 ref;
2215 
2216 	if (!rec->opts.use_clockid)
2217 		return 0;
2218 
2219 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2220 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2221 
2222 	session->header.env.clock.clockid = rec->opts.clockid;
2223 
2224 	if (gettimeofday(&ref_tod, NULL) != 0) {
2225 		pr_err("gettimeofday failed, cannot set reference time.\n");
2226 		return -1;
2227 	}
2228 
2229 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2230 		pr_err("clock_gettime failed, cannot set reference time.\n");
2231 		return -1;
2232 	}
2233 
2234 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2235 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2236 
2237 	session->header.env.clock.tod_ns = ref;
2238 
2239 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2240 	      (u64) ref_clockid.tv_nsec;
2241 
2242 	session->header.env.clock.clockid_ns = ref;
2243 	return 0;
2244 }
2245 
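/* If armed, fire the AUX area snapshot trigger and start a snapshot. */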
2246 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2247 {
2248 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2249 		trigger_hit(&auxtrace_snapshot_trigger);
2250 		auxtrace_record__snapshot_started = 1;
2251 		if (auxtrace_record__snapshot_start(rec->itr))
2252 			trigger_error(&auxtrace_snapshot_trigger);
2253 	}
2254 }
2255 
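/*
 * Ask a worker thread to stop by closing the write end of its message pipe,
 * then wait for its acknowledgement.
 */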
2256 static int record__terminate_thread(struct record_thread *thread_data)
2257 {
2258 	int err;
2259 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2260 	pid_t tid = thread_data->tid;
2261 
2262 	close(thread_data->pipes.msg[1]);
2263 	thread_data->pipes.msg[1] = -1;
2264 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2265 	if (err > 0)
2266 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2267 	else
2268 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2269 			   thread->tid, tid);
2270 
2271 	return 0;
2272 }
2273 
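/*
 * In parallel (--threads) mode, create the reader threads with signals
 * blocked, pin each one to its affinity mask and wait for its READY
 * acknowledgement before proceeding.
 */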
2274 static int record__start_threads(struct record *rec)
2275 {
2276 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2277 	struct record_thread *thread_data = rec->thread_data;
2278 	sigset_t full, mask;
2279 	pthread_t handle;
2280 	pthread_attr_t attrs;
2281 
2282 	thread = &thread_data[0];
2283 
2284 	if (!record__threads_enabled(rec))
2285 		return 0;
2286 
2287 	sigfillset(&full);
2288 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2289 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2290 		return -1;
2291 	}
2292 
2293 	pthread_attr_init(&attrs);
2294 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2295 
2296 	for (t = 1; t < nr_threads; t++) {
2297 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2298 
2299 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2300 		pthread_attr_setaffinity_np(&attrs,
2301 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2302 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2303 #endif
2304 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2305 			for (tt = 1; tt < t; tt++)
2306 				record__terminate_thread(&thread_data[tt]);
2307 			pr_err("Failed to start threads: %s\n", strerror(errno));
2308 			ret = -1;
2309 			goto out_err;
2310 		}
2311 
2312 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2313 		if (err > 0)
2314 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2315 				  thread_msg_tags[msg]);
2316 		else
2317 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2318 				   thread->tid, rec->thread_data[t].tid);
2319 	}
2320 
2321 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2322 			(cpu_set_t *)thread->mask->affinity.bits);
2323 
2324 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2325 
2326 out_err:
2327 	pthread_attr_destroy(&attrs);
2328 
2329 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2330 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2331 		ret = -1;
2332 	}
2333 
2334 	return ret;
2335 }
2336 
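/*
 * Terminate all worker threads and fold their per-thread sample, transfer
 * and compression counters into the record/session totals.
 */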
2337 static int record__stop_threads(struct record *rec)
2338 {
2339 	int t;
2340 	struct record_thread *thread_data = rec->thread_data;
2341 
2342 	for (t = 1; t < rec->nr_threads; t++)
2343 		record__terminate_thread(&thread_data[t]);
2344 
2345 	for (t = 0; t < rec->nr_threads; t++) {
2346 		rec->samples += thread_data[t].samples;
2347 		if (!record__threads_enabled(rec))
2348 			continue;
2349 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2350 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2351 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2352 			 thread_data[t].samples, thread_data[t].waking);
2353 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2354 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2355 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2356 		else
2357 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2358 	}
2359 
2360 	return 0;
2361 }
2362 
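/* Sum the poll wakeup counts of all reader threads. */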
2363 static unsigned long record__waking(struct record *rec)
2364 {
2365 	int t;
2366 	unsigned long waking = 0;
2367 	struct record_thread *thread_data = rec->thread_data;
2368 
2369 	for (t = 0; t < rec->nr_threads; t++)
2370 		waking += thread_data[t].waking;
2371 
2372 	return waking;
2373 }
2374 
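/*
 * The main body of 'perf record': set up the session, open and mmap the
 * events, optionally fork the workload, loop reading the ring buffers until
 * done, then finish (or switch) the output file.
 */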
2375 static int __cmd_record(struct record *rec, int argc, const char **argv)
2376 {
2377 	int err;
2378 	int status = 0;
2379 	const bool forks = argc > 0;
2380 	struct perf_tool *tool = &rec->tool;
2381 	struct record_opts *opts = &rec->opts;
2382 	struct perf_data *data = &rec->data;
2383 	struct perf_session *session;
2384 	bool disabled = false, draining = false;
2385 	int fd;
2386 	float ratio = 0;
2387 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2388 
2389 	atexit(record__sig_exit);
2390 	signal(SIGCHLD, sig_handler);
2391 	signal(SIGINT, sig_handler);
2392 	signal(SIGTERM, sig_handler);
2393 	signal(SIGSEGV, sigsegv_handler);
2394 
2395 	if (rec->opts.record_cgroup) {
2396 #ifndef HAVE_FILE_HANDLE
2397 		pr_err("cgroup tracking is not supported\n");
2398 		return -1;
2399 #endif
2400 	}
2401 
2402 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2403 		signal(SIGUSR2, snapshot_sig_handler);
2404 		if (rec->opts.auxtrace_snapshot_mode)
2405 			trigger_on(&auxtrace_snapshot_trigger);
2406 		if (rec->switch_output.enabled)
2407 			trigger_on(&switch_output_trigger);
2408 	} else {
2409 		signal(SIGUSR2, SIG_IGN);
2410 	}
2411 
2412 	perf_tool__init(tool, /*ordered_events=*/true);
2413 	tool->sample		= process_sample_event;
2414 	tool->fork		= perf_event__process_fork;
2415 	tool->exit		= perf_event__process_exit;
2416 	tool->comm		= perf_event__process_comm;
2417 	tool->namespaces	= perf_event__process_namespaces;
2418 	tool->mmap		= build_id__process_mmap;
2419 	tool->mmap2		= build_id__process_mmap2;
2420 	tool->itrace_start	= process_timestamp_boundary;
2421 	tool->aux		= process_timestamp_boundary;
2422 	tool->namespace_events	= rec->opts.record_namespaces;
2423 	tool->cgroup_events	= rec->opts.record_cgroup;
2424 	session = perf_session__new(data, tool);
2425 	if (IS_ERR(session)) {
2426 		pr_err("Perf session creation failed.\n");
2427 		return PTR_ERR(session);
2428 	}
2429 
2430 	if (record__threads_enabled(rec)) {
2431 		if (perf_data__is_pipe(&rec->data)) {
2432 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2433 			return -1;
2434 		}
2435 		if (rec->opts.full_auxtrace) {
2436 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2437 			return -1;
2438 		}
2439 	}
2440 
2441 	fd = perf_data__fd(data);
2442 	rec->session = session;
2443 
2444 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2445 		pr_err("Compression initialization failed.\n");
2446 		return -1;
2447 	}
2448 #ifdef HAVE_EVENTFD_SUPPORT
2449 	done_fd = eventfd(0, EFD_NONBLOCK);
2450 	if (done_fd < 0) {
2451 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2452 		status = -1;
2453 		goto out_delete_session;
2454 	}
2455 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2456 	if (err < 0) {
2457 		pr_err("Failed to add wakeup eventfd to poll list\n");
2458 		status = err;
2459 		goto out_delete_session;
2460 	}
2461 #endif // HAVE_EVENTFD_SUPPORT
2462 
2463 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2464 	session->header.env.comp_level = rec->opts.comp_level;
2465 
2466 	if (rec->opts.kcore &&
2467 	    !record__kcore_readable(&session->machines.host)) {
2468 		pr_err("ERROR: kcore is not readable.\n");
2469 		return -1;
2470 	}
2471 
2472 	if (record__init_clock(rec))
2473 		return -1;
2474 
2475 	record__init_features(rec);
2476 
2477 	if (forks) {
2478 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2479 					       workload_exec_failed_signal);
2480 		if (err < 0) {
2481 			pr_err("Couldn't run the workload!\n");
2482 			status = err;
2483 			goto out_delete_session;
2484 		}
2485 	}
2486 
2487 	/*
2488 	 * If we have just a single event and are sending data
2489 	 * through a pipe, we need to force id allocation,
2490 	 * because we synthesize the event name through the pipe
2491 	 * and need the id for that.
2492 	 */
2493 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2494 		rec->opts.sample_id = true;
2495 
2496 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2497 		rec->timestamp_filename = false;
2498 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2499 	}
2500 
2501 	/*
2502 	 * Use the global stat_config, which is zeroed, meaning aggr_mode is AGGR_NONE
2503 	 * and hybrid_merge is false.
2504 	 */
2505 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2506 
2507 	evlist__config(rec->evlist, opts, &callchain_param);
2508 
2509 	/* Debug message used by test scripts */
2510 	pr_debug3("perf record opening and mmapping events\n");
2511 	if (record__open(rec) != 0) {
2512 		err = -1;
2513 		goto out_free_threads;
2514 	}
2515 	/* Debug message used by test scripts */
2516 	pr_debug3("perf record done opening and mmapping events\n");
2517 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2518 
2519 	if (rec->opts.kcore) {
2520 		err = record__kcore_copy(&session->machines.host, data);
2521 		if (err) {
2522 			pr_err("ERROR: Failed to copy kcore\n");
2523 			goto out_free_threads;
2524 		}
2525 	}
2526 
2527 	/*
2528 	 * Normally perf_session__new would do this, but it doesn't have the
2529 	 * evlist.
2530 	 */
2531 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2532 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2533 		rec->tool.ordered_events = false;
2534 	}
2535 
2536 	if (evlist__nr_groups(rec->evlist) == 0)
2537 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2538 
2539 	if (data->is_pipe) {
2540 		err = perf_header__write_pipe(fd);
2541 		if (err < 0)
2542 			goto out_free_threads;
2543 	} else {
2544 		err = perf_session__write_header(session, rec->evlist, fd, false);
2545 		if (err < 0)
2546 			goto out_free_threads;
2547 	}
2548 
2549 	err = -1;
2550 	if (!rec->no_buildid
2551 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2552 		pr_err("Couldn't generate buildids. "
2553 		       "Use --no-buildid to profile anyway.\n");
2554 		goto out_free_threads;
2555 	}
2556 
2557 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2558 		opts->no_bpf_event = true;
2559 
2560 	err = record__setup_sb_evlist(rec);
2561 	if (err)
2562 		goto out_free_threads;
2563 
2564 	err = record__synthesize(rec, false);
2565 	if (err < 0)
2566 		goto out_free_threads;
2567 
2568 	if (rec->realtime_prio) {
2569 		struct sched_param param;
2570 
2571 		param.sched_priority = rec->realtime_prio;
2572 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2573 			pr_err("Could not set realtime priority.\n");
2574 			err = -1;
2575 			goto out_free_threads;
2576 		}
2577 	}
2578 
2579 	if (record__start_threads(rec))
2580 		goto out_free_threads;
2581 
2582 	/*
2583 	 * When perf is starting the traced process, all the events
2584 	 * (apart from group members) have enable_on_exec=1 set,
2585 	 * so don't spoil it by prematurely enabling them.
2586 	 */
2587 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2588 		evlist__enable(rec->evlist);
2589 
2590 	/*
2591 	 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2592 	 * when recording a workload; enable the event manually instead.
2593 	 */
2594 	if (rec->off_cpu)
2595 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2596 
2597 	/*
2598 	 * Let the child rip
2599 	 */
2600 	if (forks) {
2601 		struct machine *machine = &session->machines.host;
2602 		union perf_event *event;
2603 		pid_t tgid;
2604 
2605 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2606 		if (event == NULL) {
2607 			err = -ENOMEM;
2608 			goto out_child;
2609 		}
2610 
2611 		/*
2612 		 * Some H/W events are generated before the COMM event,
2613 		 * which is emitted during exec(), so perf script
2614 		 * cannot see a correct process name for those events.
2615 		 * Synthesize a COMM event to prevent that.
2616 		 */
2617 		tgid = perf_event__synthesize_comm(tool, event,
2618 						   rec->evlist->workload.pid,
2619 						   process_synthesized_event,
2620 						   machine);
2621 		free(event);
2622 
2623 		if (tgid == -1)
2624 			goto out_child;
2625 
2626 		event = malloc(sizeof(event->namespaces) +
2627 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2628 			       machine->id_hdr_size);
2629 		if (event == NULL) {
2630 			err = -ENOMEM;
2631 			goto out_child;
2632 		}
2633 
2634 		/*
2635 		 * Synthesize NAMESPACES event for the command specified.
2636 		 */
2637 		perf_event__synthesize_namespaces(tool, event,
2638 						  rec->evlist->workload.pid,
2639 						  tgid, process_synthesized_event,
2640 						  machine);
2641 		free(event);
2642 
2643 		evlist__start_workload(rec->evlist);
2644 	}
2645 
2646 	if (opts->target.initial_delay) {
2647 		pr_info(EVLIST_DISABLED_MSG);
2648 		if (opts->target.initial_delay > 0) {
2649 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2650 			evlist__enable(rec->evlist);
2651 			pr_info(EVLIST_ENABLED_MSG);
2652 		}
2653 	}
2654 
2655 	err = event_enable_timer__start(rec->evlist->eet);
2656 	if (err)
2657 		goto out_child;
2658 
2659 	/* Debug message used by test scripts */
2660 	pr_debug3("perf record has started\n");
2661 	fflush(stderr);
2662 
2663 	trigger_ready(&auxtrace_snapshot_trigger);
2664 	trigger_ready(&switch_output_trigger);
2665 	perf_hooks__invoke_record_start();
2666 
2667 	/*
2668 	 * Must write FINISHED_INIT so it will be seen after all other
2669 	 * synthesized user events, but before any regular events.
2670 	 */
2671 	err = write_finished_init(rec, false);
2672 	if (err < 0)
2673 		goto out_child;
2674 
2675 	for (;;) {
2676 		unsigned long long hits = thread->samples;
2677 
2678 		/*
2679 		 * rec->evlist->bkw_mmap_state may be
2680 		 * BKW_MMAP_EMPTY here: when done == true and
2681 		 * hits != rec->samples in the previous round.
2682 		 *
2683 		 * evlist__toggle_bkw_mmap() ensures we never
2684 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2685 		 */
2686 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2687 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2688 
2689 		if (record__mmap_read_all(rec, false) < 0) {
2690 			trigger_error(&auxtrace_snapshot_trigger);
2691 			trigger_error(&switch_output_trigger);
2692 			err = -1;
2693 			goto out_child;
2694 		}
2695 
2696 		if (auxtrace_record__snapshot_started) {
2697 			auxtrace_record__snapshot_started = 0;
2698 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2699 				record__read_auxtrace_snapshot(rec, false);
2700 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2701 				pr_err("AUX area tracing snapshot failed\n");
2702 				err = -1;
2703 				goto out_child;
2704 			}
2705 		}
2706 
2707 		if (trigger_is_hit(&switch_output_trigger)) {
2708 			/*
2709 			 * If switch_output_trigger is hit, the data in the
2710 			 * overwritable ring buffer should have been collected,
2711 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2712 			 *
2713 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2714 			 * record__mmap_read_all() didn't collect data from the
2715 			 * overwritable ring buffer, so read again.
2716 			 */
2717 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2718 				continue;
2719 			trigger_ready(&switch_output_trigger);
2720 
2721 			/*
2722 			 * Re-enable events in the overwrite ring buffer after
2723 			 * record__mmap_read_all(): we should have collected
2724 			 * data from it.
2725 			 */
2726 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2727 
2728 			if (!quiet)
2729 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2730 					record__waking(rec));
2731 			thread->waking = 0;
2732 			fd = record__switch_output(rec, false);
2733 			if (fd < 0) {
2734 				pr_err("Failed to switch to new file\n");
2735 				trigger_error(&switch_output_trigger);
2736 				err = fd;
2737 				goto out_child;
2738 			}
2739 
2740 			/* re-arm the alarm */
2741 			if (rec->switch_output.time)
2742 				alarm(rec->switch_output.time);
2743 		}
2744 
2745 		if (hits == thread->samples) {
2746 			if (done || draining)
2747 				break;
2748 			err = fdarray__poll(&thread->pollfd, -1);
2749 			/*
2750 			 * Propagate an error only if there is one. Ignore a positive
2751 			 * number of returned events and interrupt errors.
2752 			 */
2753 			if (err > 0 || (err < 0 && errno == EINTR))
2754 				err = 0;
2755 			thread->waking++;
2756 
2757 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2758 					    record__thread_munmap_filtered, NULL) == 0)
2759 				draining = true;
2760 
2761 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2762 			if (err)
2763 				goto out_child;
2764 		}
2765 
2766 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2767 			switch (cmd) {
2768 			case EVLIST_CTL_CMD_SNAPSHOT:
2769 				hit_auxtrace_snapshot_trigger(rec);
2770 				evlist__ctlfd_ack(rec->evlist);
2771 				break;
2772 			case EVLIST_CTL_CMD_STOP:
2773 				done = 1;
2774 				break;
2775 			case EVLIST_CTL_CMD_ACK:
2776 			case EVLIST_CTL_CMD_UNSUPPORTED:
2777 			case EVLIST_CTL_CMD_ENABLE:
2778 			case EVLIST_CTL_CMD_DISABLE:
2779 			case EVLIST_CTL_CMD_EVLIST:
2780 			case EVLIST_CTL_CMD_PING:
2781 			default:
2782 				break;
2783 			}
2784 		}
2785 
2786 		err = event_enable_timer__process(rec->evlist->eet);
2787 		if (err < 0)
2788 			goto out_child;
2789 		if (err) {
2790 			err = 0;
2791 			done = 1;
2792 		}
2793 
2794 		/*
2795 		 * When perf is starting the traced process, the events die with
2796 		 * the process at the end and we wait for that, so there is no
2797 		 * need to disable the events in this case.
2798 		 */
2799 		if (done && !disabled && !target__none(&opts->target)) {
2800 			trigger_off(&auxtrace_snapshot_trigger);
2801 			evlist__disable(rec->evlist);
2802 			disabled = true;
2803 		}
2804 	}
2805 
2806 	trigger_off(&auxtrace_snapshot_trigger);
2807 	trigger_off(&switch_output_trigger);
2808 
2809 	if (opts->auxtrace_snapshot_on_exit)
2810 		record__auxtrace_snapshot_exit(rec);
2811 
2812 	if (forks && workload_exec_errno) {
2813 		char msg[STRERR_BUFSIZE];
2814 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2815 		struct strbuf sb = STRBUF_INIT;
2816 
2817 		evlist__format_evsels(rec->evlist, &sb, 2048);
2818 
2819 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2820 			sb.buf, argv[0], emsg);
2821 		strbuf_release(&sb);
2822 		err = -1;
2823 		goto out_child;
2824 	}
2825 
2826 	if (!quiet)
2827 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2828 			record__waking(rec));
2829 
2830 	write_finished_init(rec, true);
2831 
2832 	if (target__none(&rec->opts.target))
2833 		record__synthesize_workload(rec, true);
2834 
2835 out_child:
2836 	record__stop_threads(rec);
2837 	record__mmap_read_all(rec, true);
2838 out_free_threads:
2839 	record__free_thread_data(rec);
2840 	evlist__finalize_ctlfd(rec->evlist);
2841 	record__aio_mmap_read_sync(rec);
2842 
2843 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2844 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2845 		session->header.env.comp_ratio = ratio + 0.5;
2846 	}
2847 
2848 	if (forks) {
2849 		int exit_status;
2850 
2851 		if (!child_finished)
2852 			kill(rec->evlist->workload.pid, SIGTERM);
2853 
2854 		wait(&exit_status);
2855 
2856 		if (err < 0)
2857 			status = err;
2858 		else if (WIFEXITED(exit_status))
2859 			status = WEXITSTATUS(exit_status);
2860 		else if (WIFSIGNALED(exit_status))
2861 			signr = WTERMSIG(exit_status);
2862 	} else
2863 		status = err;
2864 
2865 	if (rec->off_cpu)
2866 		rec->bytes_written += off_cpu_write(rec->session);
2867 
2868 	record__read_lost_samples(rec);
2869 	record__synthesize(rec, true);
2870 	/* this will be recalculated during process_buildids() */
2871 	rec->samples = 0;
2872 
2873 	if (!err) {
2874 		if (!rec->timestamp_filename) {
2875 			record__finish_output(rec);
2876 		} else {
2877 			fd = record__switch_output(rec, true);
2878 			if (fd < 0) {
2879 				status = fd;
2880 				goto out_delete_session;
2881 			}
2882 		}
2883 	}
2884 
2885 	perf_hooks__invoke_record_end();
2886 
2887 	if (!err && !quiet) {
2888 		char samples[128];
2889 		const char *postfix = rec->timestamp_filename ?
2890 					".<timestamp>" : "";
2891 
2892 		if (rec->samples && !rec->opts.full_auxtrace)
2893 			scnprintf(samples, sizeof(samples),
2894 				  " (%" PRIu64 " samples)", rec->samples);
2895 		else
2896 			samples[0] = '\0';
2897 
2898 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2899 			perf_data__size(data) / 1024.0 / 1024.0,
2900 			data->path, postfix, samples);
2901 		if (ratio) {
2902 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2903 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2904 					ratio);
2905 		}
2906 		fprintf(stderr, " ]\n");
2907 	}
2908 
2909 out_delete_session:
2910 #ifdef HAVE_EVENTFD_SUPPORT
2911 	if (done_fd >= 0) {
2912 		fd = done_fd;
2913 		done_fd = -1;
2914 
2915 		close(fd);
2916 	}
2917 #endif
2918 	zstd_fini(&session->zstd_data);
2919 	if (!opts->no_bpf_event)
2920 		evlist__stop_sb_thread(rec->sb_evlist);
2921 
2922 	perf_session__delete(session);
2923 	return status;
2924 }
2925 
2926 static void callchain_debug(struct callchain_param *callchain)
2927 {
2928 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2929 
2930 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2931 
2932 	if (callchain->record_mode == CALLCHAIN_DWARF)
2933 		pr_debug("callchain: stack dump size %d\n",
2934 			 callchain->dump_size);
2935 }
2936 
2937 int record_opts__parse_callchain(struct record_opts *record,
2938 				 struct callchain_param *callchain,
2939 				 const char *arg, bool unset)
2940 {
2941 	int ret;
2942 	callchain->enabled = !unset;
2943 
2944 	/* --no-call-graph */
2945 	if (unset) {
2946 		callchain->record_mode = CALLCHAIN_NONE;
2947 		pr_debug("callchain: disabled\n");
2948 		return 0;
2949 	}
2950 
2951 	ret = parse_callchain_record_opt(arg, callchain);
2952 	if (!ret) {
2953 		/* Enable data address sampling for DWARF unwind. */
2954 		if (callchain->record_mode == CALLCHAIN_DWARF)
2955 			record->sample_address = true;
2956 		callchain_debug(callchain);
2957 	}
2958 
2959 	return ret;
2960 }
2961 
2962 int record_parse_callchain_opt(const struct option *opt,
2963 			       const char *arg,
2964 			       int unset)
2965 {
2966 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2967 }
2968 
2969 int record_callchain_opt(const struct option *opt,
2970 			 const char *arg __maybe_unused,
2971 			 int unset __maybe_unused)
2972 {
2973 	struct callchain_param *callchain = opt->value;
2974 
2975 	callchain->enabled = true;
2976 
2977 	if (callchain->record_mode == CALLCHAIN_NONE)
2978 		callchain->record_mode = CALLCHAIN_FP;
2979 
2980 	callchain_debug(callchain);
2981 	return 0;
2982 }
2983 
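/* Handle 'record.*' variables from the perf config file. */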
2984 static int perf_record_config(const char *var, const char *value, void *cb)
2985 {
2986 	struct record *rec = cb;
2987 
2988 	if (!strcmp(var, "record.build-id")) {
2989 		if (!strcmp(value, "cache"))
2990 			rec->no_buildid_cache = false;
2991 		else if (!strcmp(value, "no-cache"))
2992 			rec->no_buildid_cache = true;
2993 		else if (!strcmp(value, "skip"))
2994 			rec->no_buildid = true;
2995 		else if (!strcmp(value, "mmap"))
2996 			rec->buildid_mmap = true;
2997 		else
2998 			return -1;
2999 		return 0;
3000 	}
3001 	if (!strcmp(var, "record.call-graph")) {
3002 		var = "call-graph.record-mode";
3003 		return perf_default_config(var, value, cb);
3004 	}
3005 #ifdef HAVE_AIO_SUPPORT
3006 	if (!strcmp(var, "record.aio")) {
3007 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3008 		if (!rec->opts.nr_cblocks)
3009 			rec->opts.nr_cblocks = nr_cblocks_default;
3010 	}
3011 #endif
3012 	if (!strcmp(var, "record.debuginfod")) {
3013 		rec->debuginfod.urls = strdup(value);
3014 		if (!rec->debuginfod.urls)
3015 			return -ENOMEM;
3016 		rec->debuginfod.set = true;
3017 	}
3018 
3019 	return 0;
3020 }
3021 
3022 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3023 {
3024 	struct record *rec = (struct record *)opt->value;
3025 
3026 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3027 }
3028 
3029 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3030 {
3031 	struct record_opts *opts = (struct record_opts *)opt->value;
3032 
3033 	if (unset || !str)
3034 		return 0;
3035 
3036 	if (!strcasecmp(str, "node"))
3037 		opts->affinity = PERF_AFFINITY_NODE;
3038 	else if (!strcasecmp(str, "cpu"))
3039 		opts->affinity = PERF_AFFINITY_CPU;
3040 
3041 	return 0;
3042 }
3043 
3044 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3045 {
3046 	mask->nbits = nr_bits;
3047 	mask->bits = bitmap_zalloc(mask->nbits);
3048 	if (!mask->bits)
3049 		return -ENOMEM;
3050 
3051 	return 0;
3052 }
3053 
3054 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3055 {
3056 	bitmap_free(mask->bits);
3057 	mask->nbits = 0;
3058 }
3059 
3060 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3061 {
3062 	int ret;
3063 
3064 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3065 	if (ret) {
3066 		mask->affinity.bits = NULL;
3067 		return ret;
3068 	}
3069 
3070 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3071 	if (ret) {
3072 		record__mmap_cpu_mask_free(&mask->maps);
3073 		mask->maps.bits = NULL;
3074 	}
3075 
3076 	return ret;
3077 }
3078 
3079 static void record__thread_mask_free(struct thread_mask *mask)
3080 {
3081 	record__mmap_cpu_mask_free(&mask->maps);
3082 	record__mmap_cpu_mask_free(&mask->affinity);
3083 }
3084 
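/*
 * Parse the --threads spec: default to per-CPU threads, otherwise match one
 * of the known thread spec tags or keep the string as a user-defined spec.
 */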
3085 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3086 {
3087 	int s;
3088 	struct record_opts *opts = opt->value;
3089 
3090 	if (unset || !str || !strlen(str)) {
3091 		opts->threads_spec = THREAD_SPEC__CPU;
3092 	} else {
3093 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3094 			if (s == THREAD_SPEC__USER) {
3095 				opts->threads_user_spec = strdup(str);
3096 				if (!opts->threads_user_spec)
3097 					return -ENOMEM;
3098 				opts->threads_spec = THREAD_SPEC__USER;
3099 				break;
3100 			}
3101 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3102 				opts->threads_spec = s;
3103 				break;
3104 			}
3105 		}
3106 	}
3107 
3108 	if (opts->threads_spec == THREAD_SPEC__USER)
3109 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3110 	else
3111 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3112 
3113 	return 0;
3114 }
3115 
3116 static int parse_output_max_size(const struct option *opt,
3117 				 const char *str, int unset)
3118 {
3119 	unsigned long *s = (unsigned long *)opt->value;
3120 	static struct parse_tag tags_size[] = {
3121 		{ .tag  = 'B', .mult = 1       },
3122 		{ .tag  = 'K', .mult = 1 << 10 },
3123 		{ .tag  = 'M', .mult = 1 << 20 },
3124 		{ .tag  = 'G', .mult = 1 << 30 },
3125 		{ .tag  = 0 },
3126 	};
3127 	unsigned long val;
3128 
3129 	if (unset) {
3130 		*s = 0;
3131 		return 0;
3132 	}
3133 
3134 	val = parse_tag_value(str, tags_size);
3135 	if (val != (unsigned long) -1) {
3136 		*s = val;
3137 		return 0;
3138 	}
3139 
3140 	return -1;
3141 }
3142 
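/* Parse -m/--mmap-pages "pages[,pages]" into data and AUX area mmap sizes. */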
3143 static int record__parse_mmap_pages(const struct option *opt,
3144 				    const char *str,
3145 				    int unset __maybe_unused)
3146 {
3147 	struct record_opts *opts = opt->value;
3148 	char *s, *p;
3149 	unsigned int mmap_pages;
3150 	int ret;
3151 
3152 	if (!str)
3153 		return -EINVAL;
3154 
3155 	s = strdup(str);
3156 	if (!s)
3157 		return -ENOMEM;
3158 
3159 	p = strchr(s, ',');
3160 	if (p)
3161 		*p = '\0';
3162 
3163 	if (*s) {
3164 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3165 		if (ret)
3166 			goto out_free;
3167 		opts->mmap_pages = mmap_pages;
3168 	}
3169 
3170 	if (!p) {
3171 		ret = 0;
3172 		goto out_free;
3173 	}
3174 
3175 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3176 	if (ret)
3177 		goto out_free;
3178 
3179 	opts->auxtrace_mmap_pages = mmap_pages;
3180 
3181 out_free:
3182 	free(s);
3183 	return ret;
3184 }
3185 
3186 static int record__parse_off_cpu_thresh(const struct option *opt,
3187 					const char *str,
3188 					int unset __maybe_unused)
3189 {
3190 	struct record_opts *opts = opt->value;
3191 	char *endptr;
3192 	u64 off_cpu_thresh_ms;
3193 
3194 	if (!str)
3195 		return -EINVAL;
3196 
3197 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3198 
3199 	/* reject trailing garbage, and a zero result when the string isn't "0" (parse failure) */
3200 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3201 		return -EINVAL;
3202 	else
3203 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3204 
3205 	return 0;
3206 }
3207 
3208 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3209 {
3210 }
3211 
3212 static int parse_control_option(const struct option *opt,
3213 				const char *str,
3214 				int unset __maybe_unused)
3215 {
3216 	struct record_opts *opts = opt->value;
3217 
3218 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3219 }
3220 
3221 static void switch_output_size_warn(struct record *rec)
3222 {
3223 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3224 	struct switch_output *s = &rec->switch_output;
3225 
3226 	wakeup_size /= 2;
3227 
3228 	if (s->size < wakeup_size) {
3229 		char buf[100];
3230 
3231 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3232 		pr_warning("WARNING: switch-output data size lower than "
3233 			   "wakeup kernel buffer size (%s), "
3234 			   "expect bigger perf.data sizes\n", buf);
3235 	}
3236 }
3237 
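/*
 * Parse the --switch-output argument into signal, size or time mode and turn
 * on timestamped output filenames accordingly.
 */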
3238 static int switch_output_setup(struct record *rec)
3239 {
3240 	struct switch_output *s = &rec->switch_output;
3241 	static struct parse_tag tags_size[] = {
3242 		{ .tag  = 'B', .mult = 1       },
3243 		{ .tag  = 'K', .mult = 1 << 10 },
3244 		{ .tag  = 'M', .mult = 1 << 20 },
3245 		{ .tag  = 'G', .mult = 1 << 30 },
3246 		{ .tag  = 0 },
3247 	};
3248 	static struct parse_tag tags_time[] = {
3249 		{ .tag  = 's', .mult = 1        },
3250 		{ .tag  = 'm', .mult = 60       },
3251 		{ .tag  = 'h', .mult = 60*60    },
3252 		{ .tag  = 'd', .mult = 60*60*24 },
3253 		{ .tag  = 0 },
3254 	};
3255 	unsigned long val;
3256 
3257 	/*
3258 	 * If we're using --switch-output-event, then we imply
3259 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3260 	 * thread to its parent.
3261 	 */
3262 	if (rec->switch_output_event_set) {
3263 		if (record__threads_enabled(rec)) {
3264 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3265 			return 0;
3266 		}
3267 		goto do_signal;
3268 	}
3269 
3270 	if (!s->set)
3271 		return 0;
3272 
3273 	if (record__threads_enabled(rec)) {
3274 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3275 		return 0;
3276 	}
3277 
3278 	if (!strcmp(s->str, "signal")) {
3279 do_signal:
3280 		s->signal = true;
3281 		pr_debug("switch-output with SIGUSR2 signal\n");
3282 		goto enabled;
3283 	}
3284 
3285 	val = parse_tag_value(s->str, tags_size);
3286 	if (val != (unsigned long) -1) {
3287 		s->size = val;
3288 		pr_debug("switch-output with %s size threshold\n", s->str);
3289 		goto enabled;
3290 	}
3291 
3292 	val = parse_tag_value(s->str, tags_time);
3293 	if (val != (unsigned long) -1) {
3294 		s->time = val;
3295 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3296 			 s->str, s->time);
3297 		goto enabled;
3298 	}
3299 
3300 	return -1;
3301 
3302 enabled:
3303 	rec->timestamp_filename = true;
3304 	s->enabled              = true;
3305 
3306 	if (s->size && !rec->opts.no_buffering)
3307 		switch_output_size_warn(rec);
3308 
3309 	return 0;
3310 }
3311 
3312 static const char * const __record_usage[] = {
3313 	"perf record [<options>] [<command>]",
3314 	"perf record [<options>] -- <command> [<options>]",
3315 	NULL
3316 };
3317 const char * const *record_usage = __record_usage;
3318 
3319 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3320 				  struct perf_sample *sample, struct machine *machine)
3321 {
3322 	/*
3323 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3324 	 * so there is no need to add them twice.
3325 	 */
3326 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3327 		return 0;
3328 	return perf_event__process_mmap(tool, event, sample, machine);
3329 }
3330 
3331 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3332 				   struct perf_sample *sample, struct machine *machine)
3333 {
3334 	/*
3335 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3336 	 * so there is no need to add them twice.
3337 	 */
3338 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3339 		return 0;
3340 
3341 	return perf_event__process_mmap2(tool, event, sample, machine);
3342 }
3343 
3344 static int process_timestamp_boundary(const struct perf_tool *tool,
3345 				      union perf_event *event __maybe_unused,
3346 				      struct perf_sample *sample,
3347 				      struct machine *machine __maybe_unused)
3348 {
3349 	struct record *rec = container_of(tool, struct record, tool);
3350 
3351 	set_timestamp_boundary(rec, sample->time);
3352 	return 0;
3353 }
3354 
3355 static int parse_record_synth_option(const struct option *opt,
3356 				     const char *str,
3357 				     int unset __maybe_unused)
3358 {
3359 	struct record_opts *opts = opt->value;
3360 	char *p = strdup(str);
3361 
3362 	if (p == NULL)
3363 		return -1;
3364 
3365 	opts->synth = parse_synth_opt(p);
3366 	free(p);
3367 
3368 	if (opts->synth < 0) {
3369 		pr_err("Invalid synth option: %s\n", str);
3370 		return -1;
3371 	}
3372 	return 0;
3373 }
3374 
3375 /*
3376  * XXX Ideally would be local to cmd_record() and passed to a record__new
3377  * because we need to have access to it in record__exit, that is called
3378  * after cmd_record() exits, but since record_options need to be accessible to
3379  * builtin-script, leave it here.
3380  *
3381  * At least we don't touch it in all the other functions here directly.
3382  *
3383  * Just say no to tons of global variables, sigh.
3384  */
3385 static struct record record = {
3386 	.opts = {
3387 		.sample_time	     = true,
3388 		.mmap_pages	     = UINT_MAX,
3389 		.user_freq	     = UINT_MAX,
3390 		.user_interval	     = ULLONG_MAX,
3391 		.freq		     = 4000,
3392 		.target		     = {
3393 			.uses_mmap   = true,
3394 			.default_per_cpu = true,
3395 		},
3396 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3397 		.nr_threads_synthesize = 1,
3398 		.ctl_fd              = -1,
3399 		.ctl_fd_ack          = -1,
3400 		.synth               = PERF_SYNTH_ALL,
3401 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3402 	},
3403 };
3404 
3405 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3406 	"\n\t\t\t\tDefault: fp";
3407 
3408 static bool dry_run;
3409 
3410 static struct parse_events_option_args parse_events_option_args = {
3411 	.evlistp = &record.evlist,
3412 };
3413 
3414 static struct parse_events_option_args switch_output_parse_events_option_args = {
3415 	.evlistp = &record.sb_evlist,
3416 };
3417 
3418 /*
3419  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3420  * with it and switch to use the library functions in perf_evlist that came
3421  * from builtin-record.c, i.e. use record_opts,
3422  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3423  * using pipes, etc.
3424  */
3425 static struct option __record_options[] = {
3426 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3427 		     "event selector. use 'perf list' to list available events",
3428 		     parse_events_option),
3429 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3430 		     "event filter", parse_filter),
3431 	OPT_BOOLEAN(0, "latency", &record.latency,
3432 		    "Enable data collection for latency profiling.\n"
3433 		    "\t\t\t  Use perf report --latency for latency-centric profile."),
3434 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3435 			   NULL, "don't record events from perf itself",
3436 			   exclude_perf),
3437 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3438 		    "record events on existing process id"),
3439 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3440 		    "record events on existing thread id"),
3441 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3442 		    "collect data with this RT SCHED_FIFO priority"),
3443 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3444 		    "collect data without buffering"),
3445 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3446 		    "collect raw sample records from all opened counters"),
3447 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3448 			    "system-wide collection from all CPUs"),
3449 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3450 		    "list of cpus to monitor"),
3451 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3452 	OPT_STRING('o', "output", &record.data.path, "file",
3453 		    "output file name"),
3454 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3455 			&record.opts.no_inherit_set,
3456 			"child tasks do not inherit counters"),
3457 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3458 		    "synthesize non-sample events at the end of output"),
3459 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3460 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3461 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3462 		    "Fail if the specified frequency can't be used"),
3463 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3464 		     "profile at this frequency",
3465 		      record__parse_freq),
3466 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3467 		     "number of mmap data pages and AUX area tracing mmap pages",
3468 		     record__parse_mmap_pages),
3469 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3470 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3471 		     record__mmap_flush_parse),
3472 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3473 			   NULL, "enables call-graph recording" ,
3474 			   &record_callchain_opt),
3475 	OPT_CALLBACK(0, "call-graph", &record.opts,
3476 		     "record_mode[,record_size]", record_callchain_help,
3477 		     &record_parse_callchain_opt),
3478 	OPT_INCR('v', "verbose", &verbose,
3479 		    "be more verbose (show counter open errors, etc)"),
3480 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3481 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3482 		    "per thread counts"),
3483 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3484 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3485 		    "Record the sample physical addresses"),
3486 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3487 		    "Record the sampled data address data page size"),
3488 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3489 		    "Record the sampled code address (ip) page size"),
3490 	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3491 		    "Record the data source for memory operations"),
3492 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3493 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3494 		    "Record the sample identifier"),
3495 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3496 			&record.opts.sample_time_set,
3497 			"Record the sample timestamps"),
3498 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3499 			"Record the sample period"),
3500 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3501 		    "don't sample"),
3502 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3503 			&record.no_buildid_cache_set,
3504 			"do not update the buildid cache"),
3505 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3506 			&record.no_buildid_set,
3507 			"do not collect buildids in perf.data"),
3508 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3509 		     "monitor event in cgroup name only",
3510 		     parse_cgroups),
3511 	OPT_CALLBACK('D', "delay", &record, "ms",
3512 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3513 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3514 		     record__parse_event_enable_time),
3515 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3516 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3517 		   "user to profile"),
3518 
3519 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3520 		     "branch any", "sample any taken branches",
3521 		     parse_branch_stack),
3522 
3523 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3524 		     "branch filter mask", "branch stack filter modes",
3525 		     parse_branch_stack),
3526 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3527 		    "sample by weight (on special events only)"),
3528 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3529 		    "sample transaction flags (special events only)"),
3530 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3531 		    "use per-thread mmaps"),
3532 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3533 		    "sample selected machine registers on interrupt,"
3534 		    " use '-I?' to list register names", parse_intr_regs),
3535 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3536 		    "sample selected machine registers in user space,"
3537 		    " use '--user-regs=?' to list register names", parse_user_regs),
3538 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3539 		    "Record running/enabled time of read (:S) events"),
3540 	OPT_CALLBACK('k', "clockid", &record.opts,
3541 	"clockid", "clockid to use for events, see clock_gettime()",
3542 	parse_clockid),
3543 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3544 			  "opts", "AUX area tracing Snapshot Mode", ""),
3545 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3546 			  "opts", "sample AUX area", ""),
3547 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3548 			"per thread proc mmap processing timeout in ms"),
3549 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3550 		    "Record namespaces events"),
3551 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3552 		    "Record cgroup events"),
3553 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3554 			&record.opts.record_switch_events_set,
3555 			"Record context switch events"),
3556 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3557 			 "Configure all used events to run in kernel space.",
3558 			 PARSE_OPT_EXCLUSIVE),
3559 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3560 			 "Configure all used events to run in user space.",
3561 			 PARSE_OPT_EXCLUSIVE),
3562 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3563 		    "collect kernel callchains"),
3564 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3565 		    "collect user callchains"),
3566 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3567 		   "file", "vmlinux pathname"),
3568 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3569 		    "Record build-id of all DSOs regardless of hits"),
3570 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3571 		    "Record build-id in map events"),
3572 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3573 		    "append timestamp to output filename"),
3574 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3575 		    "Record timestamp boundary (time of first/last samples)"),
3576 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3577 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3578 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3579 			  "signal"),
3580 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3581 			 &record.switch_output_event_set, "switch output event",
3582 			 "switch output event selector. use 'perf list' to list available events",
3583 			 parse_events_option_new_evlist),
3584 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3585 		   "Limit number of switch output generated files"),
3586 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3587 		    "Parse options then exit"),
3588 #ifdef HAVE_AIO_SUPPORT
3589 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3590 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3591 		     record__aio_parse),
3592 #endif
3593 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set the affinity mask of the trace reading thread to the NUMA node CPU mask or the CPU of the processed mmap buffer",
3595 		     record__parse_affinity),
3596 #ifdef HAVE_ZSTD_SUPPORT
3597 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3598 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3599 			    record__parse_comp_level),
3600 #endif
3601 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3602 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3603 	OPT_UINTEGER(0, "num-thread-synthesize",
3604 		     &record.opts.nr_threads_synthesize,
3605 		     "number of threads to run for event synthesis"),
3606 #ifdef HAVE_LIBPFM
3607 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3608 		"libpfm4 event selector. use 'perf list' to list available events",
3609 		parse_libpfm_events_option),
3610 #endif
3611 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3612 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3613 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3614 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3615 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3616 		      parse_control_option),
3617 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3618 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3619 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3620 			  &record.debuginfod.set, "debuginfod urls",
3621 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3622 			  "system"),
3623 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3624 			    "write collected trace data into several data files using parallel threads",
3625 			    record__parse_threads),
3626 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3627 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3628 		   "BPF filter action"),
3629 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3630 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3631 		     record__parse_off_cpu_thresh),
3632 	OPT_END()
3633 };
3634 
3635 struct option *record_options = __record_options;
3636 
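/*
 * Set in @mask the bit of every CPU contained in @cpus. A dummy (empty) CPU
 * map leaves the mask untouched; a CPU number that does not fit in the mask
 * is rejected with -ENODEV.
 */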
3637 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3638 {
3639 	struct perf_cpu cpu;
3640 	int idx;
3641 
3642 	if (cpu_map__is_dummy(cpus))
3643 		return 0;
3644 
3645 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
		/* Return -ENODEV if the input cpu is greater than the max cpu */
		if ((unsigned long)cpu.cpu >= mask->nbits)
3648 			return -ENODEV;
3649 		__set_bit(cpu.cpu, mask->bits);
3650 	}
3651 
3652 	return 0;
3653 }
3654 
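/*
 * Same as record__mmap_cpu_mask_init(), but the CPUs come from a list string
 * parsed by perf_cpu_map__new(), e.g. "0,2-4" (illustrative value).
 */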
3655 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3656 {
3657 	struct perf_cpu_map *cpus;
3658 
3659 	cpus = perf_cpu_map__new(mask_spec);
3660 	if (!cpus)
3661 		return -ENOMEM;
3662 
	bitmap_zero(mask->bits, mask->nbits);
	if (record__mmap_cpu_mask_init(mask, cpus)) {
		perf_cpu_map__put(cpus);
		return -ENODEV;
	}

	perf_cpu_map__put(cpus);
3668 
3669 	return 0;
3670 }
3671 
3672 static void record__free_thread_masks(struct record *rec, int nr_threads)
3673 {
3674 	int t;
3675 
3676 	if (rec->thread_masks)
3677 		for (t = 0; t < nr_threads; t++)
3678 			record__thread_mask_free(&rec->thread_masks[t]);
3679 
3680 	zfree(&rec->thread_masks);
3681 }
3682 
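/* Allocate @nr_threads thread masks, each sized for @nr_bits CPUs. */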
3683 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3684 {
3685 	int t, ret;
3686 
3687 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3688 	if (!rec->thread_masks) {
3689 		pr_err("Failed to allocate thread masks\n");
3690 		return -ENOMEM;
3691 	}
3692 
3693 	for (t = 0; t < nr_threads; t++) {
3694 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3695 		if (ret) {
3696 			pr_err("Failed to allocate thread masks[%d]\n", t);
3697 			goto out_free;
3698 		}
3699 	}
3700 
3701 	return 0;
3702 
3703 out_free:
3704 	record__free_thread_masks(rec, nr_threads);
3705 
3706 	return ret;
3707 }
3708 
3709 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3710 {
3711 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3712 
3713 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3714 	if (ret)
3715 		return ret;
3716 
3717 	rec->nr_threads = nr_cpus;
3718 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3719 
3720 	for (t = 0; t < rec->nr_threads; t++) {
3721 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3722 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3723 		if (verbose > 0) {
3724 			pr_debug("thread_masks[%d]: ", t);
3725 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3726 			pr_debug("thread_masks[%d]: ", t);
3727 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3728 		}
3729 	}
3730 
3731 	return 0;
3732 }
3733 
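/*
 * Build one data streaming thread per (maps, affinity) spec pair: the maps
 * mask selects the mmap buffers the thread reads, the affinity mask the CPUs
 * it may run on. Masks are limited to the recorded CPUs; empty or mutually
 * intersecting masks are rejected with -EINVAL.
 */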
3734 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3735 					  const char **maps_spec, const char **affinity_spec,
3736 					  u32 nr_spec)
3737 {
3738 	u32 s;
3739 	int ret = 0, t = 0;
3740 	struct mmap_cpu_mask cpus_mask;
3741 	struct thread_mask thread_mask, full_mask, *thread_masks;
3742 
3743 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3744 	if (ret) {
3745 		pr_err("Failed to allocate CPUs mask\n");
3746 		return ret;
3747 	}
3748 
3749 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3750 	if (ret) {
3751 		pr_err("Failed to init cpu mask\n");
3752 		goto out_free_cpu_mask;
3753 	}
3754 
3755 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3756 	if (ret) {
3757 		pr_err("Failed to allocate full mask\n");
3758 		goto out_free_cpu_mask;
3759 	}
3760 
3761 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3762 	if (ret) {
3763 		pr_err("Failed to allocate thread mask\n");
3764 		goto out_free_full_and_cpu_masks;
3765 	}
3766 
3767 	for (s = 0; s < nr_spec; s++) {
3768 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3769 		if (ret) {
3770 			pr_err("Failed to initialize maps thread mask\n");
3771 			goto out_free;
3772 		}
3773 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3774 		if (ret) {
3775 			pr_err("Failed to initialize affinity thread mask\n");
3776 			goto out_free;
3777 		}
3778 
3779 		/* ignore invalid CPUs but do not allow empty masks */
3780 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3781 				cpus_mask.bits, thread_mask.maps.nbits)) {
3782 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3783 			ret = -EINVAL;
3784 			goto out_free;
3785 		}
3786 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3787 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3788 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3789 			ret = -EINVAL;
3790 			goto out_free;
3791 		}
3792 
3793 		/* do not allow intersection with other masks (full_mask) */
3794 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3795 				      thread_mask.maps.nbits)) {
3796 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3797 			ret = -EINVAL;
3798 			goto out_free;
3799 		}
3800 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3801 				      thread_mask.affinity.nbits)) {
3802 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3803 			ret = -EINVAL;
3804 			goto out_free;
3805 		}
3806 
3807 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3808 			  thread_mask.maps.bits, full_mask.maps.nbits);
		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3811 
3812 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3813 		if (!thread_masks) {
3814 			pr_err("Failed to reallocate thread masks\n");
3815 			ret = -ENOMEM;
3816 			goto out_free;
3817 		}
3818 		rec->thread_masks = thread_masks;
3819 		rec->thread_masks[t] = thread_mask;
3820 		if (verbose > 0) {
3821 			pr_debug("thread_masks[%d]: ", t);
3822 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3823 			pr_debug("thread_masks[%d]: ", t);
3824 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3825 		}
3826 		t++;
3827 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3828 		if (ret) {
3829 			pr_err("Failed to allocate thread mask\n");
3830 			goto out_free_full_and_cpu_masks;
3831 		}
3832 	}
3833 	rec->nr_threads = t;
3834 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3835 	if (!rec->nr_threads)
3836 		ret = -EINVAL;
3837 
3838 out_free:
3839 	record__thread_mask_free(&thread_mask);
3840 out_free_full_and_cpu_masks:
3841 	record__thread_mask_free(&full_mask);
3842 out_free_cpu_mask:
3843 	record__mmap_cpu_mask_free(&cpus_mask);
3844 
3845 	return ret;
3846 }
3847 
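/*
 * One data streaming thread per core, reusing the topology core CPU lists
 * for both maps and affinity.
 */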
3848 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3849 {
3850 	int ret;
3851 	struct cpu_topology *topo;
3852 
3853 	topo = cpu_topology__new();
3854 	if (!topo) {
3855 		pr_err("Failed to allocate CPU topology\n");
3856 		return -ENOMEM;
3857 	}
3858 
3859 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3860 					     topo->core_cpus_list, topo->core_cpus_lists);
3861 	cpu_topology__delete(topo);
3862 
3863 	return ret;
3864 }
3865 
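/*
 * One data streaming thread per processor package (socket), reusing the
 * topology package CPU lists for both maps and affinity.
 */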
3866 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3867 {
3868 	int ret;
3869 	struct cpu_topology *topo;
3870 
3871 	topo = cpu_topology__new();
3872 	if (!topo) {
3873 		pr_err("Failed to allocate CPU topology\n");
3874 		return -ENOMEM;
3875 	}
3876 
3877 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3878 					     topo->package_cpus_list, topo->package_cpus_lists);
3879 	cpu_topology__delete(topo);
3880 
3881 	return ret;
3882 }
3883 
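/*
 * One data streaming thread per NUMA node, using each node's CPU list as
 * both the maps and the affinity spec.
 */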
3884 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3885 {
3886 	u32 s;
3887 	int ret;
3888 	const char **spec;
3889 	struct numa_topology *topo;
3890 
3891 	topo = numa_topology__new();
3892 	if (!topo) {
3893 		pr_err("Failed to allocate NUMA topology\n");
3894 		return -ENOMEM;
3895 	}
3896 
3897 	spec = zalloc(topo->nr * sizeof(char *));
3898 	if (!spec) {
3899 		pr_err("Failed to allocate NUMA spec\n");
3900 		ret = -ENOMEM;
3901 		goto out_delete_topo;
3902 	}
3903 	for (s = 0; s < topo->nr; s++)
3904 		spec[s] = topo->nodes[s].cpus;
3905 
3906 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3907 
3908 	zfree(&spec);
3909 
3910 out_delete_topo:
3911 	numa_topology__delete(topo);
3912 
3913 	return ret;
3914 }
3915 
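/*
 * Parse a user supplied --threads spec: ':' separated entries of the form
 * <maps cpu list>/<affinity cpu list>, e.g. --threads=0-3/0:4-7/4 (the
 * example values are illustrative), then hand the collected lists to
 * record__init_thread_masks_spec().
 */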
3916 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3917 {
3918 	int t, ret;
3919 	u32 s, nr_spec = 0;
3920 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3921 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3922 
3923 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3924 		spec = strtok_r(user_spec, ":", &spec_ptr);
3925 		if (spec == NULL)
3926 			break;
3927 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3928 		mask = strtok_r(spec, "/", &mask_ptr);
3929 		if (mask == NULL)
3930 			break;
3931 		pr_debug2("  maps mask: %s\n", mask);
3932 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3933 		if (!tmp_spec) {
3934 			pr_err("Failed to reallocate maps spec\n");
3935 			ret = -ENOMEM;
3936 			goto out_free;
3937 		}
3938 		maps_spec = tmp_spec;
3939 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3940 		if (!maps_spec[nr_spec]) {
3941 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3942 			ret = -ENOMEM;
3943 			goto out_free;
3944 		}
3945 		mask = strtok_r(NULL, "/", &mask_ptr);
3946 		if (mask == NULL) {
3947 			pr_err("Invalid thread maps or affinity specs\n");
3948 			ret = -EINVAL;
3949 			goto out_free;
3950 		}
3951 		pr_debug2("  affinity mask: %s\n", mask);
3952 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3953 		if (!tmp_spec) {
3954 			pr_err("Failed to reallocate affinity spec\n");
3955 			ret = -ENOMEM;
3956 			goto out_free;
3957 		}
3958 		affinity_spec = tmp_spec;
3959 		affinity_spec[nr_spec] = strdup(mask);
3960 		if (!affinity_spec[nr_spec]) {
3961 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3962 			ret = -ENOMEM;
3963 			goto out_free;
3964 		}
3965 		dup_mask = NULL;
3966 		nr_spec++;
3967 	}
3968 
3969 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3970 					     (const char **)affinity_spec, nr_spec);
3971 
3972 out_free:
3973 	free(dup_mask);
3974 	for (s = 0; s < nr_spec; s++) {
3975 		if (maps_spec)
3976 			free(maps_spec[s]);
3977 		if (affinity_spec)
3978 			free(affinity_spec[s]);
3979 	}
3980 	free(affinity_spec);
3981 	free(maps_spec);
3982 
3983 	return ret;
3984 }
3985 
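/* Default layout: a single data streaming thread reading all mmap buffers. */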
3986 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3987 {
3988 	int ret;
3989 
3990 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3991 	if (ret)
3992 		return ret;
3993 
3994 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3995 		return -ENODEV;
3996 
3997 	rec->nr_threads = 1;
3998 
3999 	return 0;
4000 }
4001 
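/*
 * Pick the thread mask layout for data streaming: a single thread unless
 * --threads was given, in which case split by CPU, core, package, NUMA node
 * or a user provided spec. Parallel streaming cannot be combined with
 * --per-thread mmaps.
 */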
4002 static int record__init_thread_masks(struct record *rec)
4003 {
4004 	int ret = 0;
4005 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4006 
4007 	if (!record__threads_enabled(rec))
4008 		return record__init_thread_default_masks(rec, cpus);
4009 
4010 	if (evlist__per_thread(rec->evlist)) {
4011 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4012 		return -EINVAL;
4013 	}
4014 
4015 	switch (rec->opts.threads_spec) {
4016 	case THREAD_SPEC__CPU:
4017 		ret = record__init_thread_cpu_masks(rec, cpus);
4018 		break;
4019 	case THREAD_SPEC__CORE:
4020 		ret = record__init_thread_core_masks(rec, cpus);
4021 		break;
4022 	case THREAD_SPEC__PACKAGE:
4023 		ret = record__init_thread_package_masks(rec, cpus);
4024 		break;
4025 	case THREAD_SPEC__NUMA:
4026 		ret = record__init_thread_numa_masks(rec, cpus);
4027 		break;
4028 	case THREAD_SPEC__USER:
4029 		ret = record__init_thread_user_masks(rec, cpus);
4030 		break;
4031 	default:
4032 		break;
4033 	}
4034 
4035 	return ret;
4036 }
4037 
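/*
 * Entry point of 'perf record': parse options, validate the target, set up
 * the evlist, auxtrace, build-id handling and parallel streaming state, then
 * run the session via __cmd_record().
 */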
4038 int cmd_record(int argc, const char **argv)
4039 {
4040 	int err;
4041 	struct record *rec = &record;
4042 	char errbuf[BUFSIZ];
4043 
4044 	setlocale(LC_ALL, "");
4045 
4046 #ifndef HAVE_BPF_SKEL
4047 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4048 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4049 # undef set_nobuild
4050 #endif
4051 
4052 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
4053 	symbol_conf.lazy_load_kernel_maps = true;
4054 	rec->opts.affinity = PERF_AFFINITY_SYS;
4055 
4056 	rec->evlist = evlist__new();
4057 	if (rec->evlist == NULL)
4058 		return -ENOMEM;
4059 
4060 	err = perf_config(perf_record_config, rec);
4061 	if (err)
4062 		return err;
4063 
4064 	argc = parse_options(argc, argv, record_options, record_usage,
4065 			    PARSE_OPT_STOP_AT_NON_OPTION);
4066 	if (quiet)
4067 		perf_quiet_option();
4068 
4069 	err = symbol__validate_sym_arguments();
4070 	if (err)
4071 		return err;
4072 
4073 	perf_debuginfod_setup(&record.debuginfod);
4074 
4075 	/* Make system wide (-a) the default target. */
4076 	if (!argc && target__none(&rec->opts.target))
4077 		rec->opts.target.system_wide = true;
4078 
	if (nr_cgroups && !rec->opts.target.system_wide) {
		usage_with_options_msg(record_usage, record_options,
			"cgroup monitoring only available in system-wide mode");
	}
4084 
4085 	if (record.latency) {
4086 		/*
4087 		 * There is no fundamental reason why latency profiling
4088 		 * can't work for system-wide mode, but exact semantics
4089 		 * and details are to be defined.
4090 		 * See the following thread for details:
4091 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4092 		 */
4093 		if (record.opts.target.system_wide) {
4094 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4095 			err = -EINVAL;
4096 			goto out_opts;
4097 		}
4098 		record.opts.record_switch_events = true;
4099 	}
4100 
4101 	if (rec->buildid_mmap) {
4102 		if (!perf_can_record_build_id()) {
4103 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4104 			err = -EINVAL;
4105 			goto out_opts;
4106 		}
4107 		pr_debug("Enabling build id in mmap2 events.\n");
4108 		/* Enable mmap build id synthesizing. */
4109 		symbol_conf.buildid_mmap2 = true;
4110 		/* Enable perf_event_attr::build_id bit. */
4111 		rec->opts.build_id = true;
4112 		/* Disable build id cache. */
4113 		rec->no_buildid = true;
4114 	}
4115 
4116 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4117 		pr_err("Kernel has no cgroup sampling support.\n");
4118 		err = -EINVAL;
4119 		goto out_opts;
4120 	}
4121 
4122 	if (rec->opts.kcore)
4123 		rec->opts.text_poke = true;
4124 
4125 	if (rec->opts.kcore || record__threads_enabled(rec))
4126 		rec->data.is_dir = true;
4127 
4128 	if (record__threads_enabled(rec)) {
		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
			goto out_opts;
		}
		if (record__aio_enabled(rec)) {
			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
			err = -EINVAL;
			goto out_opts;
		}
4137 	}
4138 
4139 	if (rec->opts.comp_level != 0) {
4140 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4141 		rec->no_buildid = true;
4142 	}
4143 
4144 	if (rec->opts.record_switch_events &&
4145 	    !perf_can_record_switch_events()) {
4146 		ui__error("kernel does not support recording context switch events\n");
4147 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4148 		err = -EINVAL;
4149 		goto out_opts;
4150 	}
4151 
4152 	if (switch_output_setup(rec)) {
4153 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4154 		err = -EINVAL;
4155 		goto out_opts;
4156 	}
4157 
4158 	if (rec->switch_output.time) {
4159 		signal(SIGALRM, alarm_sig_handler);
4160 		alarm(rec->switch_output.time);
4161 	}
4162 
4163 	if (rec->switch_output.num_files) {
4164 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4165 						      sizeof(char *));
4166 		if (!rec->switch_output.filenames) {
4167 			err = -EINVAL;
4168 			goto out_opts;
4169 		}
4170 	}
4171 
4172 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4173 		rec->timestamp_filename = false;
4174 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4175 	}
4176 
4177 	if (rec->filter_action) {
4178 		if (!strcmp(rec->filter_action, "pin"))
4179 			err = perf_bpf_filter__pin();
4180 		else if (!strcmp(rec->filter_action, "unpin"))
4181 			err = perf_bpf_filter__unpin();
4182 		else {
4183 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4184 			err = -EINVAL;
4185 		}
4186 		goto out_opts;
4187 	}
4188 
4189 	/* For backward compatibility, -d implies --mem-info */
4190 	if (rec->opts.sample_address)
4191 		rec->opts.sample_data_src = true;
4192 
4193 	/*
4194 	 * Allow aliases to facilitate the lookup of symbols for address
4195 	 * filters. Refer to auxtrace_parse_filters().
4196 	 */
4197 	symbol_conf.allow_aliases = true;
4198 
4199 	symbol__init(NULL);
4200 
4201 	err = record__auxtrace_init(rec);
4202 	if (err)
4203 		goto out;
4204 
4205 	if (dry_run)
4206 		goto out;
4207 
4208 	err = -ENOMEM;
4209 
4210 	if (rec->no_buildid_cache || rec->no_buildid) {
4211 		disable_buildid_cache();
4212 	} else if (rec->switch_output.enabled) {
		/*
		 * In 'perf record --switch-output', disable buildid
		 * generation by default to reduce data file switching
		 * overhead. Still generate buildids if they are explicitly
		 * requested using:
		 *
		 *  perf record --switch-output --no-no-buildid \
		 *              --no-no-buildid-cache
		 *
		 * The code below is equivalent to:
		 *
		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
		 *         disable_buildid_cache();
		 */
4228 		bool disable = true;
4229 
4230 		if (rec->no_buildid_set && !rec->no_buildid)
4231 			disable = false;
4232 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4233 			disable = false;
4234 		if (disable) {
4235 			rec->no_buildid = true;
4236 			rec->no_buildid_cache = true;
4237 			disable_buildid_cache();
4238 		}
4239 	}
4240 
4241 	if (record.opts.overwrite)
4242 		record.opts.tail_synthesize = true;
4243 
4244 	if (rec->evlist->core.nr_entries == 0) {
4245 		err = parse_event(rec->evlist, "cycles:P");
4246 		if (err)
4247 			goto out;
4248 	}
4249 
4250 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4251 		rec->opts.no_inherit = true;
4252 
4253 	err = target__validate(&rec->opts.target);
4254 	if (err) {
4255 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4256 		ui__warning("%s\n", errbuf);
4257 	}
4258 
4259 	err = target__parse_uid(&rec->opts.target);
4260 	if (err) {
4261 		int saved_errno = errno;
4262 
4263 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4264 		ui__error("%s", errbuf);
4265 
4266 		err = -saved_errno;
4267 		goto out;
4268 	}
4269 
4270 	/* Enable ignoring missing threads when -u/-p option is defined. */
4271 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4272 
4273 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4274 
4275 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4276 		arch__add_leaf_frame_record_opts(&rec->opts);
4277 
4278 	err = -ENOMEM;
4279 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4280 		if (rec->opts.target.pid != NULL) {
4281 			pr_err("Couldn't create thread/CPU maps: %s\n",
4282 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4283 			goto out;
		} else {
			usage_with_options(record_usage, record_options);
		}
4287 	}
4288 
4289 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4290 	if (err)
4291 		goto out;
4292 
	/*
	 * Take all buildids when the file contains AUX area tracing data,
	 * because the trace is not decoded at record time (that would take
	 * too long), so the DSOs that were actually hit cannot be determined.
	 */
4298 	if (rec->opts.full_auxtrace)
4299 		rec->buildid_all = true;
4300 
4301 	if (rec->opts.text_poke) {
4302 		err = record__config_text_poke(rec->evlist);
4303 		if (err) {
4304 			pr_err("record__config_text_poke failed, error %d\n", err);
4305 			goto out;
4306 		}
4307 	}
4308 
4309 	if (rec->off_cpu) {
4310 		err = record__config_off_cpu(rec);
4311 		if (err) {
4312 			pr_err("record__config_off_cpu failed, error %d\n", err);
4313 			goto out;
4314 		}
4315 	}
4316 
4317 	if (record_opts__config(&rec->opts)) {
4318 		err = -EINVAL;
4319 		goto out;
4320 	}
4321 
4322 	err = record__config_tracking_events(rec);
4323 	if (err) {
4324 		pr_err("record__config_tracking_events failed, error %d\n", err);
4325 		goto out;
4326 	}
4327 
4328 	err = record__init_thread_masks(rec);
4329 	if (err) {
4330 		pr_err("Failed to initialize parallel data streaming masks\n");
4331 		goto out;
4332 	}
4333 
4334 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4335 		rec->opts.nr_cblocks = nr_cblocks_max;
4336 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4337 
4338 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4339 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4340 
4341 	if (rec->opts.comp_level > comp_level_max)
4342 		rec->opts.comp_level = comp_level_max;
4343 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4344 
4345 	err = __cmd_record(&record, argc, argv);
4346 out:
4347 	record__free_thread_masks(rec, rec->nr_threads);
4348 	rec->nr_threads = 0;
4349 	symbol__exit();
4350 	auxtrace_record__free(rec->itr);
4351 out_opts:
4352 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4353 	evlist__delete(rec->evlist);
4354 	return err;
4355 }
4356 
4357 static void snapshot_sig_handler(int sig __maybe_unused)
4358 {
4359 	struct record *rec = &record;
4360 
4361 	hit_auxtrace_snapshot_trigger(rec);
4362 
4363 	if (switch_output_signal(rec))
4364 		trigger_hit(&switch_output_trigger);
4365 }
4366 
4367 static void alarm_sig_handler(int sig __maybe_unused)
4368 {
4369 	struct record *rec = &record;
4370 
4371 	if (switch_output_time(rec))
4372 		trigger_hit(&switch_output_trigger);
4373 }
4374