xref: /linux/tools/perf/builtin-record.c (revision a9e900bc5c5914aca750afafa459363e575d3046)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/arm64-frame-pointer-unwind-support.h"
18 #include "util/callchain.h"
19 #include "util/cgroup.h"
20 #include "util/header.h"
21 #include "util/event.h"
22 #include "util/evlist.h"
23 #include "util/evsel.h"
24 #include "util/debug.h"
25 #include "util/mmap.h"
26 #include "util/mutex.h"
27 #include "util/target.h"
28 #include "util/session.h"
29 #include "util/tool.h"
30 #include "util/stat.h"
31 #include "util/symbol.h"
32 #include "util/record.h"
33 #include "util/cpumap.h"
34 #include "util/thread_map.h"
35 #include "util/data.h"
36 #include "util/perf_regs.h"
37 #include "util/auxtrace.h"
38 #include "util/tsc.h"
39 #include "util/parse-branch-options.h"
40 #include "util/parse-regs-options.h"
41 #include "util/perf_api_probe.h"
42 #include "util/trigger.h"
43 #include "util/perf-hooks.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59 #include "dwarf-regs.h"
60 
61 #include <errno.h>
62 #include <inttypes.h>
63 #include <locale.h>
64 #include <poll.h>
65 #include <pthread.h>
66 #include <unistd.h>
67 #ifndef HAVE_GETTID
68 #include <syscall.h>
69 #endif
70 #include <sched.h>
71 #include <signal.h>
72 #ifdef HAVE_EVENTFD_SUPPORT
73 #include <sys/eventfd.h>
74 #endif
75 #include <sys/mman.h>
76 #include <sys/wait.h>
77 #include <sys/types.h>
78 #include <sys/stat.h>
79 #include <fcntl.h>
80 #include <linux/err.h>
81 #include <linux/string.h>
82 #include <linux/time64.h>
83 #include <linux/zalloc.h>
84 #include <linux/bitmap.h>
85 #include <sys/time.h>
86 
/*
 * Output switching state.
 *
 * @signal, @size and @time select what may fire switch_output_trigger
 * (see switch_output_signal(), switch_output_size() and
 * switch_output_time()). The remaining members are consumed elsewhere in
 * this file.
 */
struct switch_output {
	bool enabled;
	bool signal;		/* non-zero: switching on a signal is armed */
	unsigned long size;	/* non-zero: byte threshold compared with rec->bytes_written */
	unsigned long time;	/* non-zero: time based switching is armed */
	const char *str;
	bool set;
	char **filenames;	/* presumably the rotated output names - TODO confirm */
	int num_files;
	int cur_file;
};
98 
99 struct thread_mask {
100 	struct mmap_cpu_mask	maps;
101 	struct mmap_cpu_mask	affinity;
102 };
103 
104 struct record_thread {
105 	pid_t			tid;
106 	struct thread_mask	*mask;
107 	struct {
108 		int		msg[2];
109 		int		ack[2];
110 	} pipes;
111 	struct fdarray		pollfd;
112 	int			ctlfd_pos;
113 	int			nr_mmaps;
114 	struct mmap		**maps;
115 	struct mmap		**overwrite_maps;
116 	struct record		*rec;
117 	unsigned long long	samples;
118 	unsigned long		waking;
119 	u64			bytes_written;
120 	u64			bytes_transferred;
121 	u64			bytes_compressed;
122 };
123 
124 static __thread struct record_thread *thread;
125 
/* Message codes; THREAD_MSG__MAX is the number of codes, not a message. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name of each enum thread_msg value, indexed by the value. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED",
	"READY"
};
135 
/* Thread layout specifications; THREAD_SPEC__MAX is the number of values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Textual tag of each enum thread_spec value, indexed by the value. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined",
	"cpu",
	"core",
	"package",
	"numa",
	"user"
};
149 
/*
 * One pairing of an evlist pollfd index with a thread pollfd index, as
 * stored by record__map_thread_evlist_pollfd_indexes().
 */
struct pollfd_index_map {
	int evlist_pollfd_index;	/* index on the evlist side */
	int thread_pollfd_index;	/* index on the thread side */
};
154 
155 struct record {
156 	struct perf_tool	tool;
157 	struct record_opts	opts;
158 	u64			bytes_written;
159 	u64			thread_bytes_written;
160 	struct perf_data	data;
161 	struct auxtrace_record	*itr;
162 	struct evlist	*evlist;
163 	struct perf_session	*session;
164 	struct evlist		*sb_evlist;
165 	pthread_t		thread_id;
166 	int			realtime_prio;
167 	bool			latency;
168 	bool			switch_output_event_set;
169 	bool			no_buildid;
170 	bool			no_buildid_set;
171 	bool			no_buildid_cache;
172 	bool			no_buildid_cache_set;
173 	bool			buildid_all;
174 	bool			buildid_mmap;
175 	bool			buildid_mmap_set;
176 	bool			timestamp_filename;
177 	bool			timestamp_boundary;
178 	bool			off_cpu;
179 	const char		*filter_action;
180 	const char		*uid_str;
181 	struct switch_output	switch_output;
182 	unsigned long long	samples;
183 	unsigned long		output_max_size;	/* = 0: unlimited */
184 	struct perf_debuginfod	debuginfod;
185 	int			nr_threads;
186 	struct thread_mask	*thread_masks;
187 	struct record_thread	*thread_data;
188 	struct pollfd_index_map	*index_map;
189 	size_t			index_map_sz;
190 	size_t			index_map_cnt;
191 };
192 
193 static volatile int done;
194 
195 static volatile int auxtrace_record__snapshot_started;
196 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
197 static DEFINE_TRIGGER(switch_output_trigger);
198 
199 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
200 	"SYS", "NODE", "CPU"
201 };
202 
/* Event handlers declared here; their definitions are not in this part of the file. */
static int build_id__process_mmap(const struct perf_tool *tool,
				  union perf_event *event,
				  struct perf_sample *sample,
				  struct machine *machine);

static int build_id__process_mmap2(const struct perf_tool *tool,
				   union perf_event *event,
				   struct perf_sample *sample,
				   struct machine *machine);

static int process_timestamp_boundary(const struct perf_tool *tool,
				      union perf_event *event,
				      struct perf_sample *sample,
				      struct machine *machine);
211 
#ifndef HAVE_GETTID
/*
 * Fallback used when HAVE_GETTID is not defined: obtain the caller's
 * thread id straight from the gettid system call.
 */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
218 
219 static int record__threads_enabled(struct record *rec)
220 {
221 	return rec->opts.threads_spec;
222 }
223 
224 static bool switch_output_signal(struct record *rec)
225 {
226 	return rec->switch_output.signal &&
227 	       trigger_is_ready(&switch_output_trigger);
228 }
229 
230 static bool switch_output_size(struct record *rec)
231 {
232 	return rec->switch_output.size &&
233 	       trigger_is_ready(&switch_output_trigger) &&
234 	       (rec->bytes_written >= rec->switch_output.size);
235 }
236 
237 static bool switch_output_time(struct record *rec)
238 {
239 	return rec->switch_output.time &&
240 	       trigger_is_ready(&switch_output_trigger);
241 }
242 
243 static u64 record__bytes_written(struct record *rec)
244 {
245 	return rec->bytes_written + rec->thread_bytes_written;
246 }
247 
248 static bool record__output_max_size_exceeded(struct record *rec)
249 {
250 	return rec->output_max_size &&
251 	       (record__bytes_written(rec) >= rec->output_max_size);
252 }
253 
254 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
255 			 void *bf, size_t size)
256 {
257 	struct perf_data_file *file = &rec->session->data->file;
258 
259 	if (map && map->file)
260 		file = map->file;
261 
262 	if (perf_data_file__write(file, bf, size) < 0) {
263 		pr_err("failed to write perf data, error: %m\n");
264 		return -1;
265 	}
266 
267 	if (map && map->file) {
268 		thread->bytes_written += size;
269 		rec->thread_bytes_written += size;
270 	} else {
271 		rec->bytes_written += size;
272 	}
273 
274 	if (record__output_max_size_exceeded(rec) && !done) {
275 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
276 				" stopping session ]\n",
277 				record__bytes_written(rec) >> 10);
278 		done = 1;
279 	}
280 
281 	if (switch_output_size(rec))
282 		trigger_hit(&switch_output_trigger);
283 
284 	return 0;
285 }
286 
/* Used by the AIO code below before their definitions appear. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session,
			     struct mmap *map,
			     void *dst, size_t dst_size,
			     void *src, size_t src_size);
291 
292 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock for a write of @size bytes from @buf to @trace_fd at
 * offset @off and queue it, retrying for as long as aio_write() fails with
 * EAGAIN.
 *
 * On any other failure the control block is marked free (aio_fildes = -1)
 * and an error is logged. Returns the result of the last aio_write() call.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
317 
318 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
319 {
320 	void *rem_buf;
321 	off_t rem_off;
322 	size_t rem_size;
323 	int rc, aio_errno;
324 	ssize_t aio_ret, written;
325 
326 	aio_errno = aio_error(cblock);
327 	if (aio_errno == EINPROGRESS)
328 		return 0;
329 
330 	written = aio_ret = aio_return(cblock);
331 	if (aio_ret < 0) {
332 		if (aio_errno != EINTR)
333 			pr_err("failed to write perf data, error: %m\n");
334 		written = 0;
335 	}
336 
337 	rem_size = cblock->aio_nbytes - written;
338 
339 	if (rem_size == 0) {
340 		cblock->aio_fildes = -1;
341 		/*
342 		 * md->refcount is incremented in record__aio_pushfn() for
343 		 * every aio write request started in record__aio_push() so
344 		 * decrement it because the request is now complete.
345 		 */
346 		perf_mmap__put(&md->core);
347 		rc = 1;
348 	} else {
349 		/*
350 		 * aio write request may require restart with the
351 		 * remainder if the kernel didn't write whole
352 		 * chunk at once.
353 		 */
354 		rem_off = cblock->aio_offset + written;
355 		rem_buf = (void *)(cblock->aio_buf + written);
356 		record__aio_write(cblock, cblock->aio_fildes,
357 				rem_buf, rem_size, rem_off);
358 		rc = 0;
359 	}
360 
361 	return rc;
362 }
363 
364 static int record__aio_sync(struct mmap *md, bool sync_all)
365 {
366 	struct aiocb **aiocb = md->aio.aiocb;
367 	struct aiocb *cblocks = md->aio.cblocks;
368 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
369 	int i, do_suspend;
370 
371 	do {
372 		do_suspend = 0;
373 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
374 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
375 				if (sync_all)
376 					aiocb[i] = NULL;
377 				else
378 					return i;
379 			} else {
380 				/*
381 				 * Started aio write is not complete yet
382 				 * so it has to be waited before the
383 				 * next allocation.
384 				 */
385 				aiocb[i] = &cblocks[i];
386 				do_suspend = 1;
387 			}
388 		}
389 		if (!do_suspend)
390 			return -1;
391 
392 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
393 			if (!(errno == EAGAIN || errno == EINTR))
394 				pr_err("failed to sync perf data, error: %m\n");
395 		}
396 	} while (1);
397 }
398 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record *rec;	/* owning record session */
	void *data;		/* destination buffer being filled */
	size_t size;		/* bytes already stored in @data */
};
404 
405 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
406 {
407 	struct record_aio *aio = to;
408 
409 	/*
410 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
411 	 * to release space in the kernel buffer as fast as possible, calling
412 	 * perf_mmap__consume() from perf_mmap__push() function.
413 	 *
414 	 * That lets the kernel to proceed with storing more profiling data into
415 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
416 	 *
417 	 * Coping can be done in two steps in case the chunk of profiling data
418 	 * crosses the upper bound of the kernel buffer. In this case we first move
419 	 * part of data from map->start till the upper bound and then the remainder
420 	 * from the beginning of the kernel buffer till the end of the data chunk.
421 	 */
422 
423 	if (record__comp_enabled(aio->rec)) {
424 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
425 						   mmap__mmap_len(map) - aio->size,
426 						   buf, size);
427 		if (compressed < 0)
428 			return (int)compressed;
429 
430 		size = compressed;
431 	} else {
432 		memcpy(aio->data + aio->size, buf, size);
433 	}
434 
435 	if (!aio->size) {
436 		/*
437 		 * Increment map->refcount to guard map->aio.data[] buffer
438 		 * from premature deallocation because map object can be
439 		 * released earlier than aio write request started on
440 		 * map->aio.data[] buffer is complete.
441 		 *
442 		 * perf_mmap__put() is done at record__aio_complete()
443 		 * after started aio request completion or at record__aio_push()
444 		 * if the request failed to start.
445 		 */
446 		perf_mmap__get(&map->core);
447 	}
448 
449 	aio->size += size;
450 
451 	return size;
452 }
453 
454 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
455 {
456 	int ret, idx;
457 	int trace_fd = perf_data__fd(rec->session->data);
458 	struct record_aio aio = { .rec = rec, .size = 0 };
459 
460 	/*
461 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
462 	 * becomes available after previous aio write operation.
463 	 */
464 
465 	idx = record__aio_sync(map, false);
466 	aio.data = map->aio.data[idx];
467 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
468 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
469 		return ret;
470 
471 	rec->samples++;
472 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
473 	if (!ret) {
474 		*off += aio.size;
475 		rec->bytes_written += aio.size;
476 		if (switch_output_size(rec))
477 			trigger_hit(&switch_output_trigger);
478 	} else {
479 		/*
480 		 * Decrement map->refcount incremented in record__aio_pushfn()
481 		 * back if record__aio_write() operation failed to start, otherwise
482 		 * map->refcount is decremented in record__aio_complete() after
483 		 * aio write operation finishes successfully.
484 		 */
485 		perf_mmap__put(&map->core);
486 	}
487 
488 	return ret;
489 }
490 
/* Return the current file offset of @trace_fd, or (off_t)-1 if lseek() fails. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
495 
/* Move the file offset of @trace_fd to @pos; a failing lseek() is not reported. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
500 
501 static void record__aio_mmap_read_sync(struct record *rec)
502 {
503 	int i;
504 	struct evlist *evlist = rec->evlist;
505 	struct mmap *maps = evlist->mmap;
506 
507 	if (!record__aio_enabled(rec))
508 		return;
509 
510 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
511 		struct mmap *map = &maps[i];
512 
513 		if (map->core.base)
514 			record__aio_sync(map, true);
515 	}
516 }
517 
518 static int nr_cblocks_default = 1;
519 static int nr_cblocks_max = 4;
520 
521 static int record__aio_parse(const struct option *opt,
522 			     const char *str,
523 			     int unset)
524 {
525 	struct record_opts *opts = (struct record_opts *)opt->value;
526 
527 	if (unset) {
528 		opts->nr_cblocks = 0;
529 	} else {
530 		if (str)
531 			opts->nr_cblocks = strtol(str, NULL, 0);
532 		if (!opts->nr_cblocks)
533 			opts->nr_cblocks = nr_cblocks_default;
534 	}
535 
536 	return 0;
537 }
538 #else /* HAVE_AIO_SUPPORT */
539 static int nr_cblocks_max = 0;
540 
541 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
542 			    off_t *off __maybe_unused)
543 {
544 	return -1;
545 }
546 
547 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
548 {
549 	return -1;
550 }
551 
552 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
553 {
554 }
555 
556 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
557 {
558 }
559 #endif
560 
561 static int record__aio_enabled(struct record *rec)
562 {
563 	return rec->opts.nr_cblocks > 0;
564 }
565 
566 #define MMAP_FLUSH_DEFAULT 1
567 static int record__mmap_flush_parse(const struct option *opt,
568 				    const char *str,
569 				    int unset)
570 {
571 	int flush_max;
572 	struct record_opts *opts = (struct record_opts *)opt->value;
573 	static struct parse_tag tags[] = {
574 			{ .tag  = 'B', .mult = 1       },
575 			{ .tag  = 'K', .mult = 1 << 10 },
576 			{ .tag  = 'M', .mult = 1 << 20 },
577 			{ .tag  = 'G', .mult = 1 << 30 },
578 			{ .tag  = 0 },
579 	};
580 
581 	if (unset)
582 		return 0;
583 
584 	if (str) {
585 		opts->mmap_flush = parse_tag_value(str, tags);
586 		if (opts->mmap_flush == (int)-1)
587 			opts->mmap_flush = strtol(str, NULL, 0);
588 	}
589 
590 	if (!opts->mmap_flush)
591 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
592 
593 	flush_max = evlist__mmap_size(opts->mmap_pages);
594 	flush_max /= 4;
595 	if (opts->mmap_flush > flush_max)
596 		opts->mmap_flush = flush_max;
597 
598 	return 0;
599 }
600 
601 #ifdef HAVE_ZSTD_SUPPORT
602 static unsigned int comp_level_default = 1;
603 
604 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
605 {
606 	struct record_opts *opts = opt->value;
607 
608 	if (unset) {
609 		opts->comp_level = 0;
610 	} else {
611 		if (str)
612 			opts->comp_level = strtol(str, NULL, 0);
613 		if (!opts->comp_level)
614 			opts->comp_level = comp_level_default;
615 	}
616 
617 	return 0;
618 }
619 #endif
620 static unsigned int comp_level_max = 22;
621 
622 static int record__comp_enabled(struct record *rec)
623 {
624 	return rec->opts.comp_level > 0;
625 }
626 
627 static int process_synthesized_event(const struct perf_tool *tool,
628 				     union perf_event *event,
629 				     struct perf_sample *sample __maybe_unused,
630 				     struct machine *machine __maybe_unused)
631 {
632 	struct record *rec = container_of(tool, struct record, tool);
633 	return record__write(rec, NULL, event, event->header.size);
634 }
635 
636 static struct mutex synth_lock;
637 
638 static int process_locked_synthesized_event(const struct perf_tool *tool,
639 				     union perf_event *event,
640 				     struct perf_sample *sample __maybe_unused,
641 				     struct machine *machine __maybe_unused)
642 {
643 	int ret;
644 
645 	mutex_lock(&synth_lock);
646 	ret = process_synthesized_event(tool, event, sample, machine);
647 	mutex_unlock(&synth_lock);
648 	return ret;
649 }
650 
651 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
652 {
653 	struct record *rec = to;
654 
655 	if (record__comp_enabled(rec)) {
656 		struct perf_record_compressed2 *event = map->data;
657 		size_t padding = 0;
658 		u8 pad[8] = {0};
659 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
660 						   mmap__mmap_len(map), bf, size);
661 
662 		if (compressed < 0)
663 			return (int)compressed;
664 
665 		bf = event;
666 		thread->samples++;
667 
668 		/*
669 		 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
670 		 * error. We make it aligned here.
671 		 */
672 		event->data_size = compressed - sizeof(struct perf_record_compressed2);
673 		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
674 		padding = event->header.size - compressed;
675 		return record__write(rec, map, bf, compressed) ||
676 		       record__write(rec, map, &pad, padding);
677 	}
678 
679 	thread->samples++;
680 	return record__write(rec, map, bf, size);
681 }
682 
/* Number of the last non-SIGCHLD signal seen by sig_handler(), -1 if none. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Eventfd that sig_handler() writes to in order to wake poll(); -1 when unset. */
static volatile sig_atomic_t done_fd = -1;
#endif
688 
689 static void sig_handler(int sig)
690 {
691 	if (sig == SIGCHLD)
692 		child_finished = 1;
693 	else
694 		signr = sig;
695 
696 	done = 1;
697 #ifdef HAVE_EVENTFD_SUPPORT
698 	if (done_fd >= 0) {
699 		u64 tmp = 1;
700 		int orig_errno = errno;
701 
702 		/*
703 		 * It is possible for this signal handler to run after done is
704 		 * checked in the main loop, but before the perf counter fds are
705 		 * polled. If this happens, the poll() will continue to wait
706 		 * even though done is set, and will only break out if either
707 		 * another signal is received, or the counters are ready for
708 		 * read. To ensure the poll() doesn't sleep when done is set,
709 		 * use an eventfd (done_fd) to wake up the poll().
710 		 */
711 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
712 			pr_err("failed to signal wakeup fd, error: %m\n");
713 
714 		errno = orig_errno;
715 	}
716 #endif // HAVE_EVENTFD_SUPPORT
717 }
718 
/*
 * Handler for fatal signals: run the perf hooks recovery step first, then
 * hand the signal number to the common stack dumping handler.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();

	sighandler_dump_stack(sig);
}
724 
725 static void record__sig_exit(void)
726 {
727 	if (signr == -1)
728 		return;
729 
730 	signal(signr, SIG_DFL);
731 	raise(signr);
732 }
733 
734 static int record__process_auxtrace(const struct perf_tool *tool,
735 				    struct mmap *map,
736 				    union perf_event *event, void *data1,
737 				    size_t len1, void *data2, size_t len2)
738 {
739 	struct record *rec = container_of(tool, struct record, tool);
740 	struct perf_data *data = &rec->data;
741 	size_t padding;
742 	u8 pad[8] = {0};
743 
744 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
745 		off_t file_offset;
746 		int fd = perf_data__fd(data);
747 		int err;
748 
749 		file_offset = lseek(fd, 0, SEEK_CUR);
750 		if (file_offset == -1)
751 			return -1;
752 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
753 						     event, file_offset);
754 		if (err)
755 			return err;
756 	}
757 
758 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
759 	padding = (len1 + len2) & 7;
760 	if (padding)
761 		padding = 8 - padding;
762 
763 	record__write(rec, map, event, event->header.size);
764 	record__write(rec, map, data1, len1);
765 	if (len2)
766 		record__write(rec, map, data2, len2);
767 	record__write(rec, map, &pad, padding);
768 
769 	return 0;
770 }
771 
772 static int record__auxtrace_mmap_read(struct record *rec,
773 				      struct mmap *map)
774 {
775 	int ret;
776 
777 	ret = auxtrace_mmap__read(map, rec->itr,
778 				  perf_session__env(rec->session),
779 				  &rec->tool,
780 				  record__process_auxtrace);
781 	if (ret < 0)
782 		return ret;
783 
784 	if (ret)
785 		rec->samples++;
786 
787 	return 0;
788 }
789 
790 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
791 					       struct mmap *map)
792 {
793 	int ret;
794 
795 	ret = auxtrace_mmap__read_snapshot(map, rec->itr,
796 					   perf_session__env(rec->session),
797 					   &rec->tool,
798 					   record__process_auxtrace,
799 					   rec->opts.auxtrace_snapshot_size);
800 	if (ret < 0)
801 		return ret;
802 
803 	if (ret)
804 		rec->samples++;
805 
806 	return 0;
807 }
808 
809 static int record__auxtrace_read_snapshot_all(struct record *rec)
810 {
811 	int i;
812 	int rc = 0;
813 
814 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
815 		struct mmap *map = &rec->evlist->mmap[i];
816 
817 		if (!map->auxtrace_mmap.base)
818 			continue;
819 
820 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
821 			rc = -1;
822 			goto out;
823 		}
824 	}
825 out:
826 	return rc;
827 }
828 
829 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
830 {
831 	pr_debug("Recording AUX area tracing snapshot\n");
832 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
833 		trigger_error(&auxtrace_snapshot_trigger);
834 	} else {
835 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
836 			trigger_error(&auxtrace_snapshot_trigger);
837 		else
838 			trigger_ready(&auxtrace_snapshot_trigger);
839 	}
840 }
841 
842 static int record__auxtrace_snapshot_exit(struct record *rec)
843 {
844 	if (trigger_is_error(&auxtrace_snapshot_trigger))
845 		return 0;
846 
847 	if (!auxtrace_record__snapshot_started &&
848 	    auxtrace_record__snapshot_start(rec->itr))
849 		return -1;
850 
851 	record__read_auxtrace_snapshot(rec, true);
852 	if (trigger_is_error(&auxtrace_snapshot_trigger))
853 		return -1;
854 
855 	return 0;
856 }
857 
858 static int record__auxtrace_init(struct record *rec)
859 {
860 	int err;
861 
862 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
863 	    && record__threads_enabled(rec)) {
864 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
865 		return -EINVAL;
866 	}
867 
868 	if (!rec->itr) {
869 		err = -EINVAL;
870 		rec->itr = auxtrace_record__init(rec->evlist, &err);
871 		if (err)
872 			return err;
873 	}
874 
875 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
876 					      rec->opts.auxtrace_snapshot_opts);
877 	if (err)
878 		return err;
879 
880 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
881 					    rec->opts.auxtrace_sample_opts);
882 	if (err)
883 		return err;
884 
885 	err = auxtrace_parse_aux_action(rec->evlist);
886 	if (err)
887 		return err;
888 
889 	return auxtrace_parse_filters(rec->evlist);
890 }
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If non-dummy evsel exists, system_wide sideband is need to
926 	 * help parse sample information.
927 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
928 	 * and PERF_EVENT_COMM event to help parse task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 		/*
953 		 * User space tasks can migrate between CPUs, so when tracing
954 		 * selected CPUs, sideband for all CPUs is still needed.
955 		 */
956 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
957 			system_wide = true;
958 
959 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
960 		if (!evsel)
961 			return -ENOMEM;
962 
963 		/*
964 		 * Enable the tracking event when the process is forked for
965 		 * initial_delay, immediately for system wide.
966 		 */
967 		if (opts->target.initial_delay && !evsel->immediate &&
968 		    !target__has_cpu(&opts->target))
969 			evsel->core.attr.enable_on_exec = 1;
970 		else
971 			evsel->immediate = 1;
972 	}
973 
974 	return 0;
975 }
976 
977 static bool record__kcore_readable(struct machine *machine)
978 {
979 	char kcore[PATH_MAX];
980 	int fd;
981 
982 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
983 
984 	fd = open(kcore, O_RDONLY);
985 	if (fd < 0)
986 		return false;
987 
988 	close(fd);
989 
990 	return true;
991 }
992 
993 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
994 {
995 	char from_dir[PATH_MAX];
996 	char kcore_dir[PATH_MAX];
997 	int ret;
998 
999 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1000 
1001 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1002 	if (ret)
1003 		return ret;
1004 
1005 	return kcore_copy(from_dir, kcore_dir);
1006 }
1007 
1008 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1009 {
1010 	thread_data->pipes.msg[0] = -1;
1011 	thread_data->pipes.msg[1] = -1;
1012 	thread_data->pipes.ack[0] = -1;
1013 	thread_data->pipes.ack[1] = -1;
1014 }
1015 
1016 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1017 {
1018 	if (pipe(thread_data->pipes.msg))
1019 		return -EINVAL;
1020 
1021 	if (pipe(thread_data->pipes.ack)) {
1022 		close(thread_data->pipes.msg[0]);
1023 		thread_data->pipes.msg[0] = -1;
1024 		close(thread_data->pipes.msg[1]);
1025 		thread_data->pipes.msg[1] = -1;
1026 		return -EINVAL;
1027 	}
1028 
1029 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1030 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1031 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1032 
1033 	return 0;
1034 }
1035 
1036 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1037 {
1038 	if (thread_data->pipes.msg[0] != -1) {
1039 		close(thread_data->pipes.msg[0]);
1040 		thread_data->pipes.msg[0] = -1;
1041 	}
1042 	if (thread_data->pipes.msg[1] != -1) {
1043 		close(thread_data->pipes.msg[1]);
1044 		thread_data->pipes.msg[1] = -1;
1045 	}
1046 	if (thread_data->pipes.ack[0] != -1) {
1047 		close(thread_data->pipes.ack[0]);
1048 		thread_data->pipes.ack[0] = -1;
1049 	}
1050 	if (thread_data->pipes.ack[1] != -1) {
1051 		close(thread_data->pipes.ack[1]);
1052 		thread_data->pipes.ack[1] = -1;
1053 	}
1054 }
1055 
1056 static bool evlist__per_thread(struct evlist *evlist)
1057 {
1058 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1059 }
1060 
1061 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1062 {
1063 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1064 	struct mmap *mmap = evlist->mmap;
1065 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1066 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1067 	bool per_thread = evlist__per_thread(evlist);
1068 
1069 	if (per_thread)
1070 		thread_data->nr_mmaps = nr_mmaps;
1071 	else
1072 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1073 						      thread_data->mask->maps.nbits);
1074 	if (mmap) {
1075 		thread_data->maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
1076 		if (!thread_data->maps)
1077 			return -ENOMEM;
1078 	}
1079 	if (overwrite_mmap) {
1080 		thread_data->overwrite_maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
1081 		if (!thread_data->overwrite_maps) {
1082 			zfree(&thread_data->maps);
1083 			return -ENOMEM;
1084 		}
1085 	}
1086 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1087 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1088 
1089 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1090 		if (per_thread ||
1091 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1092 			if (thread_data->maps) {
1093 				thread_data->maps[tm] = &mmap[m];
1094 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1095 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1096 			}
1097 			if (thread_data->overwrite_maps) {
1098 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1099 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1100 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1101 			}
1102 			tm++;
1103 		}
1104 	}
1105 
1106 	return 0;
1107 }
1108 
1109 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1110 {
1111 	int f, tm, pos;
1112 	struct mmap *map, *overwrite_map;
1113 
1114 	fdarray__init(&thread_data->pollfd, 64);
1115 
1116 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1117 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1118 		overwrite_map = thread_data->overwrite_maps ?
1119 				thread_data->overwrite_maps[tm] : NULL;
1120 
1121 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1122 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1123 
1124 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1125 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1126 							      &evlist->core.pollfd);
1127 				if (pos < 0)
1128 					return pos;
1129 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1130 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1131 			}
1132 		}
1133 	}
1134 
1135 	return 0;
1136 }
1137 
1138 static void record__free_thread_data(struct record *rec)
1139 {
1140 	int t;
1141 	struct record_thread *thread_data = rec->thread_data;
1142 
1143 	if (thread_data == NULL)
1144 		return;
1145 
1146 	for (t = 0; t < rec->nr_threads; t++) {
1147 		record__thread_data_close_pipes(&thread_data[t]);
1148 		zfree(&thread_data[t].maps);
1149 		zfree(&thread_data[t].overwrite_maps);
1150 		fdarray__exit(&thread_data[t].pollfd);
1151 	}
1152 
1153 	zfree(&rec->thread_data);
1154 }
1155 
1156 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1157 						    int evlist_pollfd_index,
1158 						    int thread_pollfd_index)
1159 {
1160 	size_t x = rec->index_map_cnt;
1161 
1162 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1163 		return -ENOMEM;
1164 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1165 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1166 	rec->index_map_cnt += 1;
1167 	return 0;
1168 }
1169 
1170 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1171 						    struct evlist *evlist,
1172 						    struct record_thread *thread_data)
1173 {
1174 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1175 	struct pollfd *t_entries = thread_data->pollfd.entries;
1176 	int err = 0;
1177 	size_t i;
1178 
1179 	for (i = 0; i < rec->index_map_cnt; i++) {
1180 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1181 		int t_pos = rec->index_map[i].thread_pollfd_index;
1182 
1183 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1184 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1185 			pr_err("Thread and evlist pollfd index mismatch\n");
1186 			err = -EINVAL;
1187 			continue;
1188 		}
1189 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1190 	}
1191 	return err;
1192 }
1193 
1194 static int record__dup_non_perf_events(struct record *rec,
1195 				       struct evlist *evlist,
1196 				       struct record_thread *thread_data)
1197 {
1198 	struct fdarray *fda = &evlist->core.pollfd;
1199 	int i, ret;
1200 
1201 	for (i = 0; i < fda->nr; i++) {
1202 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1203 			continue;
1204 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1205 		if (ret < 0) {
1206 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1207 			return ret;
1208 		}
1209 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1210 			  thread_data, ret, fda->entries[i].fd);
1211 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1212 		if (ret < 0) {
1213 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1214 			return ret;
1215 		}
1216 	}
1217 	return 0;
1218 }
1219 
1220 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1221 {
1222 	int t, ret;
1223 	struct record_thread *thread_data;
1224 
1225 	rec->thread_data = calloc(rec->nr_threads, sizeof(*(rec->thread_data)));
1226 	if (!rec->thread_data) {
1227 		pr_err("Failed to allocate thread data\n");
1228 		return -ENOMEM;
1229 	}
1230 	thread_data = rec->thread_data;
1231 
1232 	for (t = 0; t < rec->nr_threads; t++)
1233 		record__thread_data_init_pipes(&thread_data[t]);
1234 
1235 	for (t = 0; t < rec->nr_threads; t++) {
1236 		thread_data[t].rec = rec;
1237 		thread_data[t].mask = &rec->thread_masks[t];
1238 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1239 		if (ret) {
1240 			pr_err("Failed to initialize thread[%d] maps\n", t);
1241 			goto out_free;
1242 		}
1243 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1244 		if (ret) {
1245 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1246 			goto out_free;
1247 		}
1248 		if (t) {
1249 			thread_data[t].tid = -1;
1250 			ret = record__thread_data_open_pipes(&thread_data[t]);
1251 			if (ret) {
1252 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1253 				goto out_free;
1254 			}
1255 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1256 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1257 			if (ret < 0) {
1258 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1259 				goto out_free;
1260 			}
1261 			thread_data[t].ctlfd_pos = ret;
1262 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1263 				 thread_data, thread_data[t].ctlfd_pos,
1264 				 thread_data[t].pipes.msg[0]);
1265 		} else {
1266 			thread_data[t].tid = gettid();
1267 
1268 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1269 			if (ret < 0)
1270 				goto out_free;
1271 
1272 			thread_data[t].ctlfd_pos = -1; /* Not used */
1273 		}
1274 	}
1275 
1276 	return 0;
1277 
1278 out_free:
1279 	record__free_thread_data(rec);
1280 
1281 	return ret;
1282 }
1283 
1284 static int record__mmap_evlist(struct record *rec,
1285 			       struct evlist *evlist)
1286 {
1287 	int i, ret;
1288 	struct record_opts *opts = &rec->opts;
1289 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1290 				  opts->auxtrace_sample_mode;
1291 
1292 	if (opts->affinity != PERF_AFFINITY_SYS)
1293 		cpu__setup_cpunode_map();
1294 
1295 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1296 				 opts->auxtrace_mmap_pages,
1297 				 auxtrace_overwrite,
1298 				 opts->nr_cblocks, opts->affinity,
1299 				 opts->mmap_flush, opts->comp_level) < 0) {
1300 		if (errno == EPERM) {
1301 			pr_err("Permission error mapping pages.\n"
1302 			       "Consider increasing "
1303 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1304 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1305 			       "(current value: %u,%u)\n",
1306 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1307 			return -errno;
1308 		} else {
1309 			pr_err("failed to mmap: %m\n");
1310 			if (errno)
1311 				return -errno;
1312 			else
1313 				return -EINVAL;
1314 		}
1315 	}
1316 
1317 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1318 		return -1;
1319 
1320 	ret = record__alloc_thread_data(rec, evlist);
1321 	if (ret)
1322 		return ret;
1323 
1324 	if (record__threads_enabled(rec)) {
1325 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1326 		if (ret) {
1327 			errno = -ret;
1328 			pr_err("Failed to create data directory: %m\n");
1329 			return ret;
1330 		}
1331 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1332 			if (evlist->mmap)
1333 				evlist->mmap[i].file = &rec->data.dir.files[i];
1334 			if (evlist->overwrite_mmap)
1335 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1336 		}
1337 	}
1338 
1339 	return 0;
1340 }
1341 
1342 static int record__mmap(struct record *rec)
1343 {
1344 	return record__mmap_evlist(rec, rec->evlist);
1345 }
1346 
1347 static int record__open(struct record *rec)
1348 {
1349 	char msg[BUFSIZ];
1350 	struct evsel *pos;
1351 	struct evlist *evlist = rec->evlist;
1352 	struct perf_session *session = rec->session;
1353 	struct record_opts *opts = &rec->opts;
1354 	int rc = 0;
1355 	bool skipped = false;
1356 	bool removed_tracking = false;
1357 
1358 	evlist__for_each_entry(evlist, pos) {
1359 		if (removed_tracking) {
1360 			/*
1361 			 * Normally the head of the list has tracking enabled
1362 			 * for sideband data like mmaps. If this event is
1363 			 * removed, make sure to add tracking to the next
1364 			 * processed event.
1365 			 */
1366 			if (!pos->tracking) {
1367 				pos->tracking = true;
1368 				evsel__config(pos, opts, &callchain_param);
1369 			}
1370 			removed_tracking = false;
1371 		}
1372 try_again:
1373 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1374 			bool report_error = true;
1375 
1376 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1377 				if (verbose > 0)
1378 					ui__warning("%s\n", msg);
1379 				goto try_again;
1380 			}
1381 			if ((errno == EINVAL || errno == EBADF) &&
1382 			    pos->core.leader != &pos->core &&
1383 			    pos->weak_group) {
1384 			        pos = evlist__reset_weak_group(evlist, pos, true);
1385 				goto try_again;
1386 			}
1387 #if defined(__aarch64__) || defined(__arm__)
1388 			if (strstr(evsel__name(pos), "cycles")) {
1389 				struct evsel *pos2;
1390 				/*
1391 				 * Unfortunately ARM has many events named
1392 				 * "cycles" on PMUs like the system-level (L3)
1393 				 * cache which don't support sampling. Only
1394 				 * display such failures to open when there is
1395 				 * only 1 cycles event or verbose is enabled.
1396 				 */
1397 				evlist__for_each_entry(evlist, pos2) {
1398 					if (pos2 == pos)
1399 						continue;
1400 					if (strstr(evsel__name(pos2), "cycles")) {
1401 						report_error = false;
1402 						break;
1403 					}
1404 				}
1405 			}
1406 #endif
1407 			if (report_error || verbose > 0) {
1408 				evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1409 				ui__error("Failure to open event '%s' on PMU '%s' which will be "
1410 					  "removed.\n%s\n",
1411 					  evsel__name(pos), evsel__pmu_name(pos), msg);
1412 			}
1413 			if (pos->tracking)
1414 				removed_tracking = true;
1415 			pos->skippable = true;
1416 			skipped = true;
1417 		}
1418 	}
1419 
1420 	if (skipped) {
1421 		struct evsel *tmp;
1422 		int idx = 0;
1423 		bool evlist_empty = true;
1424 
1425 		/* Remove evsels that failed to open and update indices. */
1426 		evlist__for_each_entry_safe(evlist, tmp, pos) {
1427 			if (pos->skippable) {
1428 				evlist__remove(evlist, pos);
1429 				continue;
1430 			}
1431 
1432 			/*
1433 			 * Note, dummy events may be command line parsed or
1434 			 * added by the tool. We care about supporting `perf
1435 			 * record -e dummy` which may be used as a permission
1436 			 * check. Dummy events that are added to the command
1437 			 * line and opened along with other events that fail,
1438 			 * will still fail as if the dummy events were tool
1439 			 * added events for the sake of code simplicity.
1440 			 */
1441 			if (!evsel__is_dummy_event(pos))
1442 				evlist_empty = false;
1443 		}
1444 		evlist__for_each_entry(evlist, pos) {
1445 			pos->core.idx = idx++;
1446 		}
1447 		/* If list is empty then fail. */
1448 		if (evlist_empty) {
1449 			ui__error("Failure to open any events for recording.\n");
1450 			rc = -1;
1451 			goto out;
1452 		}
1453 	}
1454 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1455 		pr_warning(
1456 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1457 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1458 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1459 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1460 "Samples in kernel modules won't be resolved at all.\n\n"
1461 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1462 "even with a suitable vmlinux or kallsyms file.\n\n");
1463 	}
1464 
1465 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1466 		pr_err("failed to set filter \"%s\" on event %s: %m\n",
1467 			pos->filter ?: "BPF", evsel__name(pos));
1468 		rc = -1;
1469 		goto out;
1470 	}
1471 
1472 	rc = record__mmap(rec);
1473 	if (rc)
1474 		goto out;
1475 
1476 	session->evlist = evlist;
1477 	perf_session__set_id_hdr_size(session);
1478 out:
1479 	return rc;
1480 }
1481 
1482 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1483 {
1484 	if (rec->evlist->first_sample_time == 0)
1485 		rec->evlist->first_sample_time = sample_time;
1486 
1487 	if (sample_time)
1488 		rec->evlist->last_sample_time = sample_time;
1489 }
1490 
1491 static int process_sample_event(const struct perf_tool *tool,
1492 				union perf_event *event,
1493 				struct perf_sample *sample,
1494 				struct machine *machine)
1495 {
1496 	struct record *rec = container_of(tool, struct record, tool);
1497 
1498 	set_timestamp_boundary(rec, sample->time);
1499 
1500 	if (rec->buildid_all)
1501 		return 0;
1502 
1503 	rec->samples++;
1504 	return build_id__mark_dso_hit(tool, event, sample, machine);
1505 }
1506 
1507 static int process_buildids(struct record *rec)
1508 {
1509 	struct perf_session *session = rec->session;
1510 
1511 	if (perf_data__size(&rec->data) == 0)
1512 		return 0;
1513 
1514 	/* A single DSO is needed and not all inline frames. */
1515 	symbol_conf.inline_name = false;
1516 	/*
1517 	 * During this process, it'll load kernel map and replace the
1518 	 * dso->long_name to a real pathname it found.  In this case
1519 	 * we prefer the vmlinux path like
1520 	 *   /lib/modules/3.16.4/build/vmlinux
1521 	 *
1522 	 * rather than build-id path (in debug directory).
1523 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1524 	 */
1525 	symbol_conf.ignore_vmlinux_buildid = true;
1526 	/*
1527 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1528 	 * so no need to process samples. But if timestamp_boundary is enabled,
1529 	 * it still needs to walk on all samples to get the timestamps of
1530 	 * first/last samples.
1531 	 */
1532 	if (rec->buildid_all && !rec->timestamp_boundary)
1533 		rec->tool.sample = process_event_sample_stub;
1534 
1535 	return perf_session__process_events(session);
1536 }
1537 
1538 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1539 {
1540 	int err;
1541 	struct perf_tool *tool = data;
1542 	/*
1543 	 *As for guest kernel when processing subcommand record&report,
1544 	 *we arrange module mmap prior to guest kernel mmap and trigger
1545 	 *a preload dso because default guest module symbols are loaded
1546 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1547 	 *method is used to avoid symbol missing when the first addr is
1548 	 *in module instead of in guest kernel.
1549 	 */
1550 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1551 					     machine);
1552 	if (err < 0)
1553 		pr_err("Couldn't record guest kernel [%d]'s reference"
1554 		       " relocation symbol.\n", machine->pid);
1555 
1556 	/*
1557 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1558 	 * have no _text sometimes.
1559 	 */
1560 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1561 						 machine);
1562 	if (err < 0)
1563 		pr_err("Couldn't record guest kernel [%d]'s reference"
1564 		       " relocation symbol.\n", machine->pid);
1565 }
1566 
1567 static struct perf_event_header finished_round_event = {
1568 	.size = sizeof(struct perf_event_header),
1569 	.type = PERF_RECORD_FINISHED_ROUND,
1570 };
1571 
1572 static struct perf_event_header finished_init_event = {
1573 	.size = sizeof(struct perf_event_header),
1574 	.type = PERF_RECORD_FINISHED_INIT,
1575 };
1576 
1577 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1578 {
1579 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1580 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1581 			  thread->mask->affinity.nbits)) {
1582 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1583 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1584 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1585 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1586 					(cpu_set_t *)thread->mask->affinity.bits);
1587 		if (verbose == 2) {
1588 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1589 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1590 		}
1591 	}
1592 }
1593 
1594 static size_t process_comp_header(void *record, size_t increment)
1595 {
1596 	struct perf_record_compressed2 *event = record;
1597 	size_t size = sizeof(*event);
1598 
1599 	if (increment) {
1600 		event->header.size += increment;
1601 		return increment;
1602 	}
1603 
1604 	event->header.type = PERF_RECORD_COMPRESSED2;
1605 	event->header.size = size;
1606 
1607 	return size;
1608 }
1609 
1610 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1611 			    void *dst, size_t dst_size, void *src, size_t src_size)
1612 {
1613 	ssize_t compressed;
1614 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1615 	struct zstd_data *zstd_data = &session->zstd_data;
1616 
1617 	if (map && map->file)
1618 		zstd_data = &map->zstd_data;
1619 
1620 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1621 						     max_record_size, process_comp_header);
1622 	if (compressed < 0)
1623 		return compressed;
1624 
1625 	if (map && map->file) {
1626 		thread->bytes_transferred += src_size;
1627 		thread->bytes_compressed  += compressed;
1628 	} else {
1629 		session->bytes_transferred += src_size;
1630 		session->bytes_compressed  += compressed;
1631 	}
1632 
1633 	return compressed;
1634 }
1635 
1636 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1637 				    bool overwrite, bool synch)
1638 {
1639 	u64 bytes_written = rec->bytes_written;
1640 	int i;
1641 	int rc = 0;
1642 	int nr_mmaps;
1643 	struct mmap **maps;
1644 	int trace_fd = perf_data__fd(&rec->data);
1645 	off_t off = 0;
1646 
1647 	if (!evlist)
1648 		return 0;
1649 
1650 	nr_mmaps = thread->nr_mmaps;
1651 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1652 
1653 	if (!maps)
1654 		return 0;
1655 
1656 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1657 		return 0;
1658 
1659 	if (record__aio_enabled(rec))
1660 		off = record__aio_get_pos(trace_fd);
1661 
1662 	for (i = 0; i < nr_mmaps; i++) {
1663 		u64 flush = 0;
1664 		struct mmap *map = maps[i];
1665 
1666 		if (map->core.base) {
1667 			record__adjust_affinity(rec, map);
1668 			if (synch) {
1669 				flush = map->core.flush;
1670 				map->core.flush = 1;
1671 			}
1672 			if (!record__aio_enabled(rec)) {
1673 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1674 					if (synch)
1675 						map->core.flush = flush;
1676 					rc = -1;
1677 					goto out;
1678 				}
1679 			} else {
1680 				if (record__aio_push(rec, map, &off) < 0) {
1681 					record__aio_set_pos(trace_fd, off);
1682 					if (synch)
1683 						map->core.flush = flush;
1684 					rc = -1;
1685 					goto out;
1686 				}
1687 			}
1688 			if (synch)
1689 				map->core.flush = flush;
1690 		}
1691 
1692 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1693 		    !rec->opts.auxtrace_sample_mode &&
1694 		    record__auxtrace_mmap_read(rec, map) != 0) {
1695 			rc = -1;
1696 			goto out;
1697 		}
1698 	}
1699 
1700 	if (record__aio_enabled(rec))
1701 		record__aio_set_pos(trace_fd, off);
1702 
1703 	/*
1704 	 * Mark the round finished in case we wrote
1705 	 * at least one event.
1706 	 *
1707 	 * No need for round events in directory mode,
1708 	 * because per-cpu maps and files have data
1709 	 * sorted by kernel.
1710 	 */
1711 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1712 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1713 
1714 	if (overwrite)
1715 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1716 out:
1717 	return rc;
1718 }
1719 
1720 static int record__mmap_read_all(struct record *rec, bool synch)
1721 {
1722 	int err;
1723 
1724 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1725 	if (err)
1726 		return err;
1727 
1728 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1729 }
1730 
1731 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1732 					   void *arg __maybe_unused)
1733 {
1734 	struct perf_mmap *map = fda->priv[fd].ptr;
1735 
1736 	if (map)
1737 		perf_mmap__put(map);
1738 }
1739 
1740 static void *record__thread(void *arg)
1741 {
1742 	enum thread_msg msg = THREAD_MSG__READY;
1743 	bool terminate = false;
1744 	struct fdarray *pollfd;
1745 	int err, ctlfd_pos;
1746 
1747 	thread = arg;
1748 	thread->tid = gettid();
1749 
1750 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1751 	if (err == -1)
1752 		pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);
1753 
1754 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1755 
1756 	pollfd = &thread->pollfd;
1757 	ctlfd_pos = thread->ctlfd_pos;
1758 
1759 	for (;;) {
1760 		unsigned long long hits = thread->samples;
1761 
1762 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1763 			break;
1764 
1765 		if (hits == thread->samples) {
1766 
1767 			err = fdarray__poll(pollfd, -1);
1768 			/*
1769 			 * Propagate error, only if there's any. Ignore positive
1770 			 * number of returned events and interrupt error.
1771 			 */
1772 			if (err > 0 || (err < 0 && errno == EINTR))
1773 				err = 0;
1774 			thread->waking++;
1775 
1776 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1777 					    record__thread_munmap_filtered, NULL) == 0)
1778 				break;
1779 		}
1780 
1781 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1782 			terminate = true;
1783 			close(thread->pipes.msg[0]);
1784 			thread->pipes.msg[0] = -1;
1785 			pollfd->entries[ctlfd_pos].fd = -1;
1786 			pollfd->entries[ctlfd_pos].events = 0;
1787 		}
1788 
1789 		pollfd->entries[ctlfd_pos].revents = 0;
1790 	}
1791 	record__mmap_read_all(thread->rec, true);
1792 
1793 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1794 	if (err == -1)
1795 		pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);
1796 
1797 	return NULL;
1798 }
1799 
1800 static void record__init_features(struct record *rec)
1801 {
1802 	struct perf_session *session = rec->session;
1803 	int feat;
1804 
1805 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1806 		perf_header__set_feat(&session->header, feat);
1807 
1808 	if (rec->no_buildid)
1809 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1810 
1811 	if (!have_tracepoints(&rec->evlist->core.entries))
1812 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1813 
1814 	if (!rec->opts.branch_stack)
1815 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1816 
1817 	if (!rec->opts.full_auxtrace)
1818 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1819 
1820 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1821 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1822 
1823 	if (!rec->opts.use_clockid)
1824 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1825 
1826 	if (!record__threads_enabled(rec))
1827 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1828 
1829 	if (!record__comp_enabled(rec))
1830 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1831 
1832 	perf_header__clear_feat(&session->header, HEADER_STAT);
1833 }
1834 
1835 static void
1836 record__finish_output(struct record *rec)
1837 {
1838 	int i;
1839 	struct perf_data *data = &rec->data;
1840 	int fd = perf_data__fd(data);
1841 
1842 	if (data->is_pipe) {
1843 		/* Just to display approx. size */
1844 		data->file.size = rec->bytes_written;
1845 		return;
1846 	}
1847 
1848 	rec->session->header.data_size += rec->bytes_written;
1849 	data->file.size = perf_data__seek(data, 0, SEEK_CUR);
1850 	if (record__threads_enabled(rec)) {
1851 		for (i = 0; i < data->dir.nr; i++) {
1852 			data->dir.files[i].size =
1853 				perf_data_file__seek(&data->dir.files[i], 0, SEEK_CUR);
1854 		}
1855 	}
1856 
1857 	/* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1858 	if (!rec->no_buildid || !rec->no_buildid_cache) {
1859 		process_buildids(rec);
1860 
1861 		if (rec->buildid_all)
1862 			perf_session__dsos_hit_all(rec->session);
1863 	}
1864 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1865 	perf_session__cache_build_ids(rec->session);
1866 }
1867 
1868 static int record__synthesize_workload(struct record *rec, bool tail)
1869 {
1870 	int err;
1871 	struct perf_thread_map *thread_map;
1872 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1873 
1874 	if (rec->opts.tail_synthesize != tail)
1875 		return 0;
1876 
1877 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1878 	if (thread_map == NULL)
1879 		return -1;
1880 
1881 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1882 						 process_synthesized_event,
1883 						 &rec->session->machines.host,
1884 						 needs_mmap,
1885 						 rec->opts.record_data_mmap);
1886 	perf_thread_map__put(thread_map);
1887 	return err;
1888 }
1889 
1890 static int write_finished_init(struct record *rec, bool tail)
1891 {
1892 	if (rec->opts.tail_synthesize != tail)
1893 		return 0;
1894 
1895 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1896 }
1897 
1898 static int record__synthesize(struct record *rec, bool tail);
1899 
1900 static int
1901 record__switch_output(struct record *rec, bool at_exit)
1902 {
1903 	struct perf_data *data = &rec->data;
1904 	char *new_filename = NULL;
1905 	int fd, err;
1906 
1907 	/* Same Size:      "2015122520103046"*/
1908 	char timestamp[] = "InvalidTimestamp";
1909 
1910 	record__aio_mmap_read_sync(rec);
1911 
1912 	write_finished_init(rec, true);
1913 
1914 	record__synthesize(rec, true);
1915 	if (target__none(&rec->opts.target))
1916 		record__synthesize_workload(rec, true);
1917 
1918 	rec->samples = 0;
1919 	record__finish_output(rec);
1920 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1921 	if (err) {
1922 		pr_err("Failed to get current timestamp\n");
1923 		return -EINVAL;
1924 	}
1925 
1926 	fd = perf_data__switch(data, timestamp,
1927 			       rec->session->header.data_offset,
1928 			       at_exit, &new_filename);
1929 	if (fd >= 0 && !at_exit) {
1930 		rec->bytes_written = 0;
1931 		rec->session->header.data_size = 0;
1932 	}
1933 
1934 	if (!quiet) {
1935 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1936 			data->path, timestamp);
1937 	}
1938 
1939 	if (rec->switch_output.num_files) {
1940 		int n = rec->switch_output.cur_file + 1;
1941 
1942 		if (n >= rec->switch_output.num_files)
1943 			n = 0;
1944 		rec->switch_output.cur_file = n;
1945 		if (rec->switch_output.filenames[n]) {
1946 			remove(rec->switch_output.filenames[n]);
1947 			zfree(&rec->switch_output.filenames[n]);
1948 		}
1949 		rec->switch_output.filenames[n] = new_filename;
1950 	} else {
1951 		free(new_filename);
1952 	}
1953 
1954 	/* Output tracking events */
1955 	if (!at_exit) {
1956 		record__synthesize(rec, false);
1957 
1958 		/*
1959 		 * In 'perf record --switch-output' without -a,
1960 		 * record__synthesize() in record__switch_output() won't
1961 		 * generate tracking events because there's no thread_map
1962 		 * in evlist. Which causes newly created perf.data doesn't
1963 		 * contain map and comm information.
1964 		 * Create a fake thread_map and directly call
1965 		 * perf_event__synthesize_thread_map() for those events.
1966 		 */
1967 		if (target__none(&rec->opts.target))
1968 			record__synthesize_workload(rec, false);
1969 		write_finished_init(rec, false);
1970 	}
1971 	return fd;
1972 }
1973 
1974 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1975 					struct perf_record_lost_samples *lost,
1976 					int cpu_idx, int thread_idx, u64 lost_count,
1977 					u16 misc_flag)
1978 {
1979 	struct perf_sample_id *sid;
1980 	struct perf_sample sample;
1981 	int id_hdr_size;
1982 
1983 	perf_sample__init(&sample, /*all=*/true);
1984 	lost->lost = lost_count;
1985 	if (evsel->core.ids) {
1986 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1987 		sample.id = sid->id;
1988 	}
1989 
1990 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1991 						       evsel->core.attr.sample_type, &sample);
1992 	lost->header.size = sizeof(*lost) + id_hdr_size;
1993 	lost->header.misc = misc_flag;
1994 	record__write(rec, NULL, lost, lost->header.size);
1995 	perf_sample__exit(&sample);
1996 }
1997 
1998 static void record__read_lost_samples(struct record *rec)
1999 {
2000 	struct perf_session *session = rec->session;
2001 	struct perf_record_lost_samples_and_ids lost;
2002 	struct evsel *evsel;
2003 
2004 	/* there was an error during record__open */
2005 	if (session->evlist == NULL)
2006 		return;
2007 
2008 	evlist__for_each_entry(session->evlist, evsel) {
2009 		struct xyarray *xy = evsel->core.sample_id;
2010 		u64 lost_count;
2011 
2012 		if (xy == NULL || evsel->core.fd == NULL)
2013 			continue;
2014 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2015 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2016 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2017 			continue;
2018 		}
2019 
2020 		for (int x = 0; x < xyarray__max_x(xy); x++) {
2021 			for (int y = 0; y < xyarray__max_y(xy); y++) {
2022 				struct perf_counts_values count;
2023 
2024 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2025 					pr_debug("read LOST count failed\n");
2026 					return;
2027 				}
2028 
2029 				if (count.lost) {
2030 					memset(&lost, 0, sizeof(lost));
2031 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2032 					__record__save_lost_samples(rec, evsel, &lost.lost,
2033 								    x, y, count.lost, 0);
2034 				}
2035 			}
2036 		}
2037 
2038 		lost_count = perf_bpf_filter__lost_count(evsel);
2039 		if (lost_count) {
2040 			memset(&lost, 0, sizeof(lost));
2041 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2042 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2043 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2044 		}
2045 	}
2046 }
2047 
2048 static volatile sig_atomic_t workload_exec_errno;
2049 
2050 /*
2051  * evlist__prepare_workload will send a SIGUSR1
2052  * if the fork fails, since we asked by setting its
2053  * want_signal to true.
2054  */
2055 static void workload_exec_failed_signal(int signo __maybe_unused,
2056 					siginfo_t *info,
2057 					void *ucontext __maybe_unused)
2058 {
2059 	workload_exec_errno = info->si_value.sival_int;
2060 	done = 1;
2061 	child_finished = 1;
2062 }
2063 
2064 static void snapshot_sig_handler(int sig);
2065 static void alarm_sig_handler(int sig);
2066 
2067 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2068 {
2069 	if (evlist) {
2070 		if (evlist->mmap && evlist->mmap[0].core.base)
2071 			return evlist->mmap[0].core.base;
2072 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2073 			return evlist->overwrite_mmap[0].core.base;
2074 	}
2075 	return NULL;
2076 }
2077 
2078 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2079 {
2080 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2081 	if (pc)
2082 		return pc;
2083 	return NULL;
2084 }
2085 
2086 static int record__synthesize(struct record *rec, bool tail)
2087 {
2088 	struct perf_session *session = rec->session;
2089 	struct machine *machine = &session->machines.host;
2090 	struct perf_data *data = &rec->data;
2091 	struct record_opts *opts = &rec->opts;
2092 	struct perf_tool *tool = &rec->tool;
2093 	int err = 0;
2094 	event_op f = process_synthesized_event;
2095 
2096 	if (rec->opts.tail_synthesize != tail)
2097 		return 0;
2098 
2099 	if (data->is_pipe) {
2100 		err = perf_event__synthesize_for_pipe(tool, session, data,
2101 						      process_synthesized_event);
2102 		if (err < 0)
2103 			goto out;
2104 
2105 		rec->bytes_written += err;
2106 	}
2107 
2108 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2109 					  process_synthesized_event, machine);
2110 	if (err)
2111 		goto out;
2112 
2113 	/* Synthesize id_index before auxtrace_info */
2114 	err = perf_event__synthesize_id_index(tool,
2115 					      process_synthesized_event,
2116 					      session->evlist, machine);
2117 	if (err)
2118 		goto out;
2119 
2120 	if (rec->opts.full_auxtrace) {
2121 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2122 					session, process_synthesized_event);
2123 		if (err)
2124 			goto out;
2125 	}
2126 
2127 	if (!evlist__exclude_kernel(rec->evlist)) {
2128 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2129 							 machine);
2130 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2131 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2132 				   "Check /proc/kallsyms permission or run as root.\n");
2133 
2134 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2135 						     machine);
2136 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2137 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2138 				   "Check /proc/modules permission or run as root.\n");
2139 	}
2140 
2141 	if (perf_guest) {
2142 		machines__process_guests(&session->machines,
2143 					 perf_event__synthesize_guest_os, tool);
2144 	}
2145 
2146 	err = perf_event__synthesize_extra_attr(&rec->tool,
2147 						rec->evlist,
2148 						process_synthesized_event,
2149 						data->is_pipe);
2150 	if (err)
2151 		goto out;
2152 
2153 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2154 						 process_synthesized_event,
2155 						NULL);
2156 	if (err < 0) {
2157 		pr_err("Couldn't synthesize thread map.\n");
2158 		return err;
2159 	}
2160 
2161 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2162 					     process_synthesized_event, NULL);
2163 	if (err < 0) {
2164 		pr_err("Couldn't synthesize cpu map.\n");
2165 		return err;
2166 	}
2167 
2168 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2169 						machine, opts);
2170 	if (err < 0) {
2171 		pr_warning("Couldn't synthesize bpf events.\n");
2172 		err = 0;
2173 	}
2174 
2175 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2176 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2177 						     machine);
2178 		if (err < 0) {
2179 			pr_warning("Couldn't synthesize cgroup events.\n");
2180 			err = 0;
2181 		}
2182 	}
2183 
2184 	if (rec->opts.nr_threads_synthesize > 1) {
2185 		mutex_init(&synth_lock);
2186 		perf_set_multithreaded();
2187 		f = process_locked_synthesized_event;
2188 	}
2189 
2190 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2191 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2192 
2193 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2194 						    rec->evlist->core.threads,
2195 						    f, needs_mmap, opts->record_data_mmap,
2196 						    rec->opts.nr_threads_synthesize);
2197 	}
2198 
2199 	if (rec->opts.nr_threads_synthesize > 1) {
2200 		perf_set_singlethreaded();
2201 		mutex_destroy(&synth_lock);
2202 	}
2203 
2204 out:
2205 	return err;
2206 }
2207 
2208 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2209 {
2210 #ifdef HAVE_LIBBPF_SUPPORT
2211 	perf_event__synthesize_final_bpf_metadata(rec->session,
2212 						  process_synthesized_event);
2213 #endif
2214 }
2215 
2216 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2217 {
2218 	struct record *rec = data;
2219 	pthread_kill(rec->thread_id, SIGUSR2);
2220 	return 0;
2221 }
2222 
2223 static int record__setup_sb_evlist(struct record *rec)
2224 {
2225 	struct record_opts *opts = &rec->opts;
2226 
2227 	if (rec->sb_evlist != NULL) {
2228 		/*
2229 		 * We get here if --switch-output-event populated the
2230 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2231 		 * to the main thread.
2232 		 */
2233 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2234 		rec->thread_id = pthread_self();
2235 	}
2236 #ifdef HAVE_LIBBPF_SUPPORT
2237 	if (!opts->no_bpf_event) {
2238 		if (rec->sb_evlist == NULL) {
2239 			rec->sb_evlist = evlist__new();
2240 
2241 			if (rec->sb_evlist == NULL) {
2242 				pr_err("Couldn't create side band evlist.\n.");
2243 				return -1;
2244 			}
2245 		}
2246 
2247 		if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2248 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2249 			return -1;
2250 		}
2251 	}
2252 #endif
2253 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2254 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2255 		opts->no_bpf_event = true;
2256 	}
2257 
2258 	return 0;
2259 }
2260 
2261 static int record__init_clock(struct record *rec)
2262 {
2263 	struct perf_session *session = rec->session;
2264 	struct timespec ref_clockid;
2265 	struct timeval ref_tod;
2266 	struct perf_env *env = perf_session__env(session);
2267 	u64 ref;
2268 
2269 	if (!rec->opts.use_clockid)
2270 		return 0;
2271 
2272 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2273 		env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2274 
2275 	env->clock.clockid = rec->opts.clockid;
2276 
2277 	if (gettimeofday(&ref_tod, NULL) != 0) {
2278 		pr_err("gettimeofday failed, cannot set reference time.\n");
2279 		return -1;
2280 	}
2281 
2282 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2283 		pr_err("clock_gettime failed, cannot set reference time.\n");
2284 		return -1;
2285 	}
2286 
2287 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2288 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2289 
2290 	env->clock.tod_ns = ref;
2291 
2292 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2293 	      (u64) ref_clockid.tv_nsec;
2294 
2295 	env->clock.clockid_ns = ref;
2296 	return 0;
2297 }
2298 
2299 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2300 {
2301 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2302 		trigger_hit(&auxtrace_snapshot_trigger);
2303 		auxtrace_record__snapshot_started = 1;
2304 		if (auxtrace_record__snapshot_start(rec->itr))
2305 			trigger_error(&auxtrace_snapshot_trigger);
2306 	}
2307 }
2308 
2309 static int record__terminate_thread(struct record_thread *thread_data)
2310 {
2311 	int err;
2312 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2313 	pid_t tid = thread_data->tid;
2314 
2315 	close(thread_data->pipes.msg[1]);
2316 	thread_data->pipes.msg[1] = -1;
2317 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2318 	if (err > 0)
2319 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2320 	else
2321 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2322 			   thread->tid, tid);
2323 
2324 	return 0;
2325 }
2326 
2327 static int record__start_threads(struct record *rec)
2328 {
2329 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2330 	struct record_thread *thread_data = rec->thread_data;
2331 	sigset_t full, mask;
2332 	pthread_t handle;
2333 	pthread_attr_t attrs;
2334 
2335 	thread = &thread_data[0];
2336 
2337 	if (!record__threads_enabled(rec))
2338 		return 0;
2339 
2340 	sigfillset(&full);
2341 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2342 		pr_err("Failed to block signals on threads start: %m\n");
2343 		return -1;
2344 	}
2345 
2346 	pthread_attr_init(&attrs);
2347 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2348 
2349 	for (t = 1; t < nr_threads; t++) {
2350 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2351 
2352 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2353 		pthread_attr_setaffinity_np(&attrs,
2354 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2355 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2356 #endif
2357 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2358 			for (tt = 1; tt < t; tt++)
2359 				record__terminate_thread(&thread_data[t]);
2360 			pr_err("Failed to start threads: %m\n");
2361 			ret = -1;
2362 			goto out_err;
2363 		}
2364 
2365 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2366 		if (err > 0)
2367 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2368 				  thread_msg_tags[msg]);
2369 		else
2370 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2371 				   thread->tid, rec->thread_data[t].tid);
2372 	}
2373 
2374 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2375 			(cpu_set_t *)thread->mask->affinity.bits);
2376 
2377 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2378 
2379 out_err:
2380 	pthread_attr_destroy(&attrs);
2381 
2382 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2383 		pr_err("Failed to unblock signals on threads start: %m\n");
2384 		ret = -1;
2385 	}
2386 
2387 	return ret;
2388 }
2389 
2390 static int record__stop_threads(struct record *rec)
2391 {
2392 	int t;
2393 	struct record_thread *thread_data = rec->thread_data;
2394 
2395 	for (t = 1; t < rec->nr_threads; t++)
2396 		record__terminate_thread(&thread_data[t]);
2397 
2398 	for (t = 0; t < rec->nr_threads; t++) {
2399 		rec->samples += thread_data[t].samples;
2400 		if (!record__threads_enabled(rec))
2401 			continue;
2402 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2403 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2404 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2405 			 thread_data[t].samples, thread_data[t].waking);
2406 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2407 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2408 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2409 		else
2410 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2411 	}
2412 
2413 	return 0;
2414 }
2415 
2416 static unsigned long record__waking(struct record *rec)
2417 {
2418 	int t;
2419 	unsigned long waking = 0;
2420 	struct record_thread *thread_data = rec->thread_data;
2421 
2422 	for (t = 0; t < rec->nr_threads; t++)
2423 		waking += thread_data[t].waking;
2424 
2425 	return waking;
2426 }
2427 
2428 static int __cmd_record(struct record *rec, int argc, const char **argv)
2429 {
2430 	int err;
2431 	int status = 0;
2432 	const bool forks = argc > 0;
2433 	struct perf_tool *tool = &rec->tool;
2434 	struct record_opts *opts = &rec->opts;
2435 	struct perf_data *data = &rec->data;
2436 	struct perf_session *session;
2437 	bool disabled = false, draining = false;
2438 	int fd;
2439 	float ratio = 0;
2440 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2441 	struct perf_env *env;
2442 
2443 	atexit(record__sig_exit);
2444 	signal(SIGCHLD, sig_handler);
2445 	signal(SIGINT, sig_handler);
2446 	signal(SIGTERM, sig_handler);
2447 	signal(SIGSEGV, sigsegv_handler);
2448 
2449 	if (rec->opts.record_cgroup) {
2450 #ifndef HAVE_FILE_HANDLE
2451 		pr_err("cgroup tracking is not supported\n");
2452 		return -1;
2453 #endif
2454 	}
2455 
2456 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2457 		signal(SIGUSR2, snapshot_sig_handler);
2458 		if (rec->opts.auxtrace_snapshot_mode)
2459 			trigger_on(&auxtrace_snapshot_trigger);
2460 		if (rec->switch_output.enabled)
2461 			trigger_on(&switch_output_trigger);
2462 	} else {
2463 		signal(SIGUSR2, SIG_IGN);
2464 	}
2465 
2466 	perf_tool__init(tool, /*ordered_events=*/true);
2467 	tool->sample		= process_sample_event;
2468 	tool->fork		= perf_event__process_fork;
2469 	tool->exit		= perf_event__process_exit;
2470 	tool->comm		= perf_event__process_comm;
2471 	tool->namespaces	= perf_event__process_namespaces;
2472 	tool->mmap		= build_id__process_mmap;
2473 	tool->mmap2		= build_id__process_mmap2;
2474 	tool->itrace_start	= process_timestamp_boundary;
2475 	tool->aux		= process_timestamp_boundary;
2476 	tool->namespace_events	= rec->opts.record_namespaces;
2477 	tool->cgroup_events	= rec->opts.record_cgroup;
2478 	session = perf_session__new(data, tool);
2479 	if (IS_ERR(session)) {
2480 		pr_err("Perf session creation failed.\n");
2481 		return PTR_ERR(session);
2482 	}
2483 	env = perf_session__env(session);
2484 	if (record__threads_enabled(rec)) {
2485 		if (perf_data__is_pipe(&rec->data)) {
2486 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2487 			return -1;
2488 		}
2489 		if (rec->opts.full_auxtrace) {
2490 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2491 			return -1;
2492 		}
2493 	}
2494 
2495 	fd = perf_data__fd(data);
2496 	rec->session = session;
2497 
2498 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2499 		pr_err("Compression initialization failed.\n");
2500 		return -1;
2501 	}
2502 #ifdef HAVE_EVENTFD_SUPPORT
2503 	done_fd = eventfd(0, EFD_NONBLOCK);
2504 	if (done_fd < 0) {
2505 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2506 		status = -1;
2507 		goto out_delete_session;
2508 	}
2509 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2510 	if (err < 0) {
2511 		pr_err("Failed to add wakeup eventfd to poll list\n");
2512 		status = err;
2513 		goto out_delete_session;
2514 	}
2515 #endif // HAVE_EVENTFD_SUPPORT
2516 
2517 	env->comp_type  = PERF_COMP_ZSTD;
2518 	env->comp_level = rec->opts.comp_level;
2519 
2520 	if (rec->opts.kcore &&
2521 	    !record__kcore_readable(&session->machines.host)) {
2522 		pr_err("ERROR: kcore is not readable.\n");
2523 		return -1;
2524 	}
2525 
2526 	if (record__init_clock(rec))
2527 		return -1;
2528 
2529 	record__init_features(rec);
2530 
2531 	if (forks) {
2532 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2533 					       workload_exec_failed_signal);
2534 		if (err < 0) {
2535 			pr_err("Couldn't run the workload!\n");
2536 			status = err;
2537 			goto out_delete_session;
2538 		}
2539 	}
2540 
2541 	/*
2542 	 * If we have just single event and are sending data
2543 	 * through pipe, we need to force the ids allocation,
2544 	 * because we synthesize event name through the pipe
2545 	 * and need the id for that.
2546 	 */
2547 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2548 		rec->opts.sample_id = true;
2549 
2550 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2551 		rec->timestamp_filename = false;
2552 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2553 	}
2554 
2555 	/*
2556 	 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2557 	 * and hybrid_merge is false.
2558 	 */
2559 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2560 
2561 	evlist__config(rec->evlist, opts, &callchain_param);
2562 
2563 	/* Debug message used by test scripts */
2564 	pr_debug3("perf record opening and mmapping events\n");
2565 	if (record__open(rec) != 0) {
2566 		err = -1;
2567 		goto out_free_threads;
2568 	}
2569 	/* Debug message used by test scripts */
2570 	pr_debug3("perf record done opening and mmapping events\n");
2571 	env->comp_mmap_len = session->evlist->core.mmap_len;
2572 
2573 	if (rec->opts.kcore) {
2574 		err = record__kcore_copy(&session->machines.host, data);
2575 		if (err) {
2576 			pr_err("ERROR: Failed to copy kcore\n");
2577 			goto out_free_threads;
2578 		}
2579 	}
2580 
2581 	/*
2582 	 * Normally perf_session__new would do this, but it doesn't have the
2583 	 * evlist.
2584 	 */
2585 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2586 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2587 		rec->tool.ordered_events = false;
2588 	}
2589 
2590 	if (evlist__nr_groups(rec->evlist) == 0)
2591 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2592 
2593 	if (data->is_pipe) {
2594 		err = perf_header__write_pipe(fd);
2595 		if (err < 0)
2596 			goto out_free_threads;
2597 	} else {
2598 		err = perf_session__write_header(session, rec->evlist, fd, false);
2599 		if (err < 0)
2600 			goto out_free_threads;
2601 	}
2602 
2603 	err = -1;
2604 	if (!rec->no_buildid
2605 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2606 		pr_err("Couldn't generate buildids. "
2607 		       "Use --no-buildid to profile anyway.\n");
2608 		goto out_free_threads;
2609 	}
2610 
2611 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2612 		opts->no_bpf_event = true;
2613 
2614 	err = record__setup_sb_evlist(rec);
2615 	if (err)
2616 		goto out_free_threads;
2617 
2618 	err = record__synthesize(rec, false);
2619 	if (err < 0)
2620 		goto out_free_threads;
2621 
2622 	if (rec->realtime_prio) {
2623 		struct sched_param param;
2624 
2625 		param.sched_priority = rec->realtime_prio;
2626 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2627 			pr_err("Could not set realtime priority.\n");
2628 			err = -1;
2629 			goto out_free_threads;
2630 		}
2631 	}
2632 
2633 	if (record__start_threads(rec))
2634 		goto out_free_threads;
2635 
2636 	/*
2637 	 * When perf is starting the traced process, all the events
2638 	 * (apart from group members) have enable_on_exec=1 set,
2639 	 * so don't spoil it by prematurely enabling them.
2640 	 */
2641 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2642 		evlist__enable(rec->evlist);
2643 
2644 	/*
2645 	 * offcpu-time does not call execve, so enable_on_exe wouldn't work
2646 	 * when recording a workload, do it manually
2647 	 */
2648 	if (rec->off_cpu)
2649 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2650 
2651 	/*
2652 	 * Let the child rip
2653 	 */
2654 	if (forks) {
2655 		struct machine *machine = &session->machines.host;
2656 		union perf_event *event;
2657 		pid_t tgid;
2658 
2659 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2660 		if (event == NULL) {
2661 			err = -ENOMEM;
2662 			goto out_child;
2663 		}
2664 
2665 		/*
2666 		 * Some H/W events are generated before COMM event
2667 		 * which is emitted during exec(), so perf script
2668 		 * cannot see a correct process name for those events.
2669 		 * Synthesize COMM event to prevent it.
2670 		 */
2671 		tgid = perf_event__synthesize_comm(tool, event,
2672 						   rec->evlist->workload.pid,
2673 						   process_synthesized_event,
2674 						   machine);
2675 		free(event);
2676 
2677 		if (tgid == -1)
2678 			goto out_child;
2679 
2680 		event = malloc(sizeof(event->namespaces) +
2681 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2682 			       machine->id_hdr_size);
2683 		if (event == NULL) {
2684 			err = -ENOMEM;
2685 			goto out_child;
2686 		}
2687 
2688 		/*
2689 		 * Synthesize NAMESPACES event for the command specified.
2690 		 */
2691 		perf_event__synthesize_namespaces(tool, event,
2692 						  rec->evlist->workload.pid,
2693 						  tgid, process_synthesized_event,
2694 						  machine);
2695 		free(event);
2696 
2697 		evlist__start_workload(rec->evlist);
2698 	}
2699 
2700 	if (opts->target.initial_delay) {
2701 		pr_info(EVLIST_DISABLED_MSG);
2702 		if (opts->target.initial_delay > 0) {
2703 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2704 			evlist__enable(rec->evlist);
2705 			pr_info(EVLIST_ENABLED_MSG);
2706 		}
2707 	}
2708 
2709 	err = event_enable_timer__start(rec->evlist->eet);
2710 	if (err)
2711 		goto out_child;
2712 
2713 	/* Debug message used by test scripts */
2714 	pr_debug3("perf record has started\n");
2715 	fflush(stderr);
2716 
2717 	trigger_ready(&auxtrace_snapshot_trigger);
2718 	trigger_ready(&switch_output_trigger);
2719 	perf_hooks__invoke_record_start();
2720 
2721 	/*
2722 	 * Must write FINISHED_INIT so it will be seen after all other
2723 	 * synthesized user events, but before any regular events.
2724 	 */
2725 	err = write_finished_init(rec, false);
2726 	if (err < 0)
2727 		goto out_child;
2728 
2729 	for (;;) {
2730 		unsigned long long hits = thread->samples;
2731 
2732 		/*
2733 		 * rec->evlist->bkw_mmap_state is possible to be
2734 		 * BKW_MMAP_EMPTY here: when done == true and
2735 		 * hits != rec->samples in previous round.
2736 		 *
2737 		 * evlist__toggle_bkw_mmap ensure we never
2738 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2739 		 */
2740 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2741 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2742 
2743 		if (record__mmap_read_all(rec, false) < 0) {
2744 			trigger_error(&auxtrace_snapshot_trigger);
2745 			trigger_error(&switch_output_trigger);
2746 			err = -1;
2747 			goto out_child_no_flush;
2748 		}
2749 
2750 		if (auxtrace_record__snapshot_started) {
2751 			auxtrace_record__snapshot_started = 0;
2752 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2753 				record__read_auxtrace_snapshot(rec, false);
2754 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2755 				pr_err("AUX area tracing snapshot failed\n");
2756 				err = -1;
2757 				goto out_child;
2758 			}
2759 		}
2760 
2761 		if (trigger_is_hit(&switch_output_trigger)) {
2762 			/*
2763 			 * If switch_output_trigger is hit, the data in
2764 			 * overwritable ring buffer should have been collected,
2765 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2766 			 *
2767 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2768 			 * record__mmap_read_all() didn't collect data from
2769 			 * overwritable ring buffer. Read again.
2770 			 */
2771 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2772 				continue;
2773 			trigger_ready(&switch_output_trigger);
2774 
2775 			/*
2776 			 * Reenable events in overwrite ring buffer after
2777 			 * record__mmap_read_all(): we should have collected
2778 			 * data from it.
2779 			 */
2780 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2781 
2782 			if (!quiet)
2783 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2784 					record__waking(rec));
2785 			thread->waking = 0;
2786 			fd = record__switch_output(rec, false);
2787 			if (fd < 0) {
2788 				pr_err("Failed to switch to new file\n");
2789 				trigger_error(&switch_output_trigger);
2790 				err = fd;
2791 				goto out_child;
2792 			}
2793 
2794 			/* re-arm the alarm */
2795 			if (rec->switch_output.time)
2796 				alarm(rec->switch_output.time);
2797 		}
2798 
2799 		if (hits == thread->samples) {
2800 			if (done || draining)
2801 				break;
2802 			err = fdarray__poll(&thread->pollfd, -1);
2803 			/*
2804 			 * Propagate error, only if there's any. Ignore positive
2805 			 * number of returned events and interrupt error.
2806 			 */
2807 			if (err > 0 || (err < 0 && errno == EINTR))
2808 				err = 0;
2809 			thread->waking++;
2810 
2811 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2812 					    record__thread_munmap_filtered, NULL) == 0)
2813 				draining = true;
2814 
2815 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2816 			if (err)
2817 				goto out_child;
2818 		}
2819 
2820 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2821 			switch (cmd) {
2822 			case EVLIST_CTL_CMD_SNAPSHOT:
2823 				hit_auxtrace_snapshot_trigger(rec);
2824 				evlist__ctlfd_ack(rec->evlist);
2825 				break;
2826 			case EVLIST_CTL_CMD_STOP:
2827 				done = 1;
2828 				break;
2829 			case EVLIST_CTL_CMD_ACK:
2830 			case EVLIST_CTL_CMD_UNSUPPORTED:
2831 			case EVLIST_CTL_CMD_ENABLE:
2832 			case EVLIST_CTL_CMD_DISABLE:
2833 			case EVLIST_CTL_CMD_EVLIST:
2834 			case EVLIST_CTL_CMD_PING:
2835 			default:
2836 				break;
2837 			}
2838 		}
2839 
2840 		err = event_enable_timer__process(rec->evlist->eet);
2841 		if (err < 0)
2842 			goto out_child;
2843 		if (err) {
2844 			err = 0;
2845 			done = 1;
2846 		}
2847 
2848 		/*
2849 		 * When perf is starting the traced process, at the end events
2850 		 * die with the process and we wait for that. Thus no need to
2851 		 * disable events in this case.
2852 		 */
2853 		if (done && !disabled && !target__none(&opts->target)) {
2854 			trigger_off(&auxtrace_snapshot_trigger);
2855 			evlist__disable(rec->evlist);
2856 			disabled = true;
2857 		}
2858 	}
2859 
2860 	trigger_off(&auxtrace_snapshot_trigger);
2861 	trigger_off(&switch_output_trigger);
2862 
2863 	record__synthesize_final_bpf_metadata(rec);
2864 
2865 	if (opts->auxtrace_snapshot_on_exit)
2866 		record__auxtrace_snapshot_exit(rec);
2867 
2868 	if (forks && workload_exec_errno) {
2869 		char msg[STRERR_BUFSIZE];
2870 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2871 		struct strbuf sb = STRBUF_INIT;
2872 
2873 		evlist__format_evsels(rec->evlist, &sb, 2048);
2874 
2875 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2876 			sb.buf, argv[0], emsg);
2877 		strbuf_release(&sb);
2878 		err = -1;
2879 		goto out_child;
2880 	}
2881 
2882 	if (!quiet)
2883 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2884 			record__waking(rec));
2885 
2886 	write_finished_init(rec, true);
2887 
2888 	if (target__none(&rec->opts.target))
2889 		record__synthesize_workload(rec, true);
2890 
2891 out_child:
2892 	record__stop_threads(rec);
2893 	record__mmap_read_all(rec, true);
2894 	goto out_free_threads;
2895 out_child_no_flush:
2896 	/* mmap read already failed — retrying would just fail again */
2897 	record__stop_threads(rec);
2898 out_free_threads:
2899 	record__free_thread_data(rec);
2900 	evlist__finalize_ctlfd(rec->evlist);
2901 	record__aio_mmap_read_sync(rec);
2902 
2903 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2904 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2905 		env->comp_ratio = ratio + 0.5;
2906 	}
2907 
2908 	if (forks) {
2909 		int exit_status;
2910 
2911 		if (!child_finished)
2912 			kill(rec->evlist->workload.pid, SIGTERM);
2913 
2914 		wait(&exit_status);
2915 
2916 		if (err < 0)
2917 			status = err;
2918 		else if (WIFEXITED(exit_status))
2919 			status = WEXITSTATUS(exit_status);
2920 		else if (WIFSIGNALED(exit_status))
2921 			signr = WTERMSIG(exit_status);
2922 	} else
2923 		status = err;
2924 
2925 	if (rec->off_cpu)
2926 		rec->bytes_written += off_cpu_write(rec->session);
2927 
2928 	record__read_lost_samples(rec);
2929 	/* this will be recalculated during process_buildids() */
2930 	rec->samples = 0;
2931 
2932 	if (!err) {
2933 		record__synthesize(rec, true);
2934 		if (!rec->timestamp_filename) {
2935 			record__finish_output(rec);
2936 		} else {
2937 			fd = record__switch_output(rec, true);
2938 			if (fd < 0) {
2939 				status = fd;
2940 				goto out_delete_session;
2941 			}
2942 		}
2943 	}
2944 
2945 	perf_hooks__invoke_record_end();
2946 
2947 	if (!err && !quiet) {
2948 		char samples[128];
2949 		const char *postfix = rec->timestamp_filename ?
2950 					".<timestamp>" : "";
2951 
2952 		if (rec->samples && !rec->opts.full_auxtrace)
2953 			scnprintf(samples, sizeof(samples),
2954 				  " (%" PRIu64 " samples)", rec->samples);
2955 		else
2956 			samples[0] = '\0';
2957 
2958 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2959 			perf_data__size(data) / 1024.0 / 1024.0,
2960 			data->path, postfix, samples);
2961 		if (ratio) {
2962 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2963 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2964 					ratio);
2965 		}
2966 		fprintf(stderr, " ]\n");
2967 	}
2968 
2969 out_delete_session:
2970 #ifdef HAVE_EVENTFD_SUPPORT
2971 	if (done_fd >= 0) {
2972 		fd = done_fd;
2973 		done_fd = -1;
2974 
2975 		close(fd);
2976 	}
2977 #endif
2978 	zstd_fini(&session->zstd_data);
2979 	if (!opts->no_bpf_event)
2980 		evlist__stop_sb_thread(rec->sb_evlist);
2981 
2982 	perf_session__delete(session);
2983 	return status;
2984 }
2985 
2986 static int record_parse_callchain_opt(const struct option *opt,
2987 			       const char *arg,
2988 			       int unset)
2989 {
2990 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2991 }
2992 
2993 static int record_callchain_opt(const struct option *opt,
2994 				const char *arg __maybe_unused,
2995 				int unset)
2996 {
2997 	/*
2998 	 * The -g option only sets the callchain if not already configured by
2999 	 * .perfconfig. It does, however, enable it.
3000 	 */
3001 	if (callchain_param.record_mode != CALLCHAIN_NONE) {
3002 		callchain_param.enabled = true;
3003 		return 0;
3004 	}
3005 
3006 	return record_opts__parse_callchain(opt->value, &callchain_param,
3007 					    EM_HOST != EM_S390 ? "fp" : "dwarf",
3008 					    unset);
3009 }
3010 
3011 
3012 static int perf_record_config(const char *var, const char *value, void *cb)
3013 {
3014 	struct record *rec = cb;
3015 
3016 	if (!strcmp(var, "record.build-id")) {
3017 		if (!strcmp(value, "cache"))
3018 			rec->no_buildid_cache = false;
3019 		else if (!strcmp(value, "no-cache"))
3020 			rec->no_buildid_cache = true;
3021 		else if (!strcmp(value, "skip"))
3022 			rec->no_buildid = rec->no_buildid_cache = true;
3023 		else if (!strcmp(value, "mmap"))
3024 			rec->buildid_mmap = true;
3025 		else if (!strcmp(value, "no-mmap"))
3026 			rec->buildid_mmap = false;
3027 		else
3028 			return -1;
3029 		return 0;
3030 	}
3031 	if (!strcmp(var, "record.call-graph")) {
3032 		var = "call-graph.record-mode";
3033 		return perf_default_config(var, value, cb);
3034 	}
3035 #ifdef HAVE_AIO_SUPPORT
3036 	if (!strcmp(var, "record.aio")) {
3037 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3038 		if (!rec->opts.nr_cblocks)
3039 			rec->opts.nr_cblocks = nr_cblocks_default;
3040 	}
3041 #endif
3042 	if (!strcmp(var, "record.debuginfod")) {
3043 		rec->debuginfod.urls = strdup(value);
3044 		if (!rec->debuginfod.urls)
3045 			return -ENOMEM;
3046 		rec->debuginfod.set = true;
3047 	}
3048 
3049 	return 0;
3050 }
3051 
3052 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3053 {
3054 	struct record *rec = (struct record *)opt->value;
3055 
3056 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3057 }
3058 
3059 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3060 {
3061 	struct record_opts *opts = (struct record_opts *)opt->value;
3062 
3063 	if (unset || !str)
3064 		return 0;
3065 
3066 	if (!strcasecmp(str, "node"))
3067 		opts->affinity = PERF_AFFINITY_NODE;
3068 	else if (!strcasecmp(str, "cpu"))
3069 		opts->affinity = PERF_AFFINITY_CPU;
3070 
3071 	return 0;
3072 }
3073 
3074 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3075 {
3076 	mask->nbits = nr_bits;
3077 	mask->bits = bitmap_zalloc(mask->nbits);
3078 	if (!mask->bits)
3079 		return -ENOMEM;
3080 
3081 	return 0;
3082 }
3083 
3084 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3085 {
3086 	bitmap_free(mask->bits);
3087 	mask->bits = NULL;
3088 	mask->nbits = 0;
3089 }
3090 
3091 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3092 {
3093 	int ret;
3094 
3095 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3096 	if (ret) {
3097 		mask->affinity.bits = NULL;
3098 		return ret;
3099 	}
3100 
3101 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3102 	if (ret) {
3103 		record__mmap_cpu_mask_free(&mask->maps);
3104 		mask->maps.bits = NULL;
3105 	}
3106 
3107 	return ret;
3108 }
3109 
3110 static void record__thread_mask_free(struct thread_mask *mask)
3111 {
3112 	record__mmap_cpu_mask_free(&mask->maps);
3113 	record__mmap_cpu_mask_free(&mask->affinity);
3114 }
3115 
3116 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3117 {
3118 	int s;
3119 	struct record_opts *opts = opt->value;
3120 
3121 	if (unset || !str || !strlen(str)) {
3122 		opts->threads_spec = THREAD_SPEC__CPU;
3123 	} else {
3124 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3125 			if (s == THREAD_SPEC__USER) {
3126 				opts->threads_user_spec = strdup(str);
3127 				if (!opts->threads_user_spec)
3128 					return -ENOMEM;
3129 				opts->threads_spec = THREAD_SPEC__USER;
3130 				break;
3131 			}
3132 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3133 				opts->threads_spec = s;
3134 				break;
3135 			}
3136 		}
3137 	}
3138 
3139 	if (opts->threads_spec == THREAD_SPEC__USER)
3140 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3141 	else
3142 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3143 
3144 	return 0;
3145 }
3146 
3147 static int parse_output_max_size(const struct option *opt,
3148 				 const char *str, int unset)
3149 {
3150 	unsigned long *s = (unsigned long *)opt->value;
3151 	static struct parse_tag tags_size[] = {
3152 		{ .tag  = 'B', .mult = 1       },
3153 		{ .tag  = 'K', .mult = 1 << 10 },
3154 		{ .tag  = 'M', .mult = 1 << 20 },
3155 		{ .tag  = 'G', .mult = 1 << 30 },
3156 		{ .tag  = 0 },
3157 	};
3158 	unsigned long val;
3159 
3160 	if (unset) {
3161 		*s = 0;
3162 		return 0;
3163 	}
3164 
3165 	val = parse_tag_value(str, tags_size);
3166 	if (val != (unsigned long) -1) {
3167 		*s = val;
3168 		return 0;
3169 	}
3170 
3171 	return -1;
3172 }
3173 
3174 static int record__parse_mmap_pages(const struct option *opt,
3175 				    const char *str,
3176 				    int unset __maybe_unused)
3177 {
3178 	struct record_opts *opts = opt->value;
3179 	char *s, *p;
3180 	unsigned int mmap_pages;
3181 	int ret;
3182 
3183 	if (!str)
3184 		return -EINVAL;
3185 
3186 	s = strdup(str);
3187 	if (!s)
3188 		return -ENOMEM;
3189 
3190 	p = strchr(s, ',');
3191 	if (p)
3192 		*p = '\0';
3193 
3194 	if (*s) {
3195 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3196 		if (ret)
3197 			goto out_free;
3198 		opts->mmap_pages = mmap_pages;
3199 	}
3200 
3201 	if (!p) {
3202 		ret = 0;
3203 		goto out_free;
3204 	}
3205 
3206 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3207 	if (ret)
3208 		goto out_free;
3209 
3210 	opts->auxtrace_mmap_pages = mmap_pages;
3211 
3212 out_free:
3213 	free(s);
3214 	return ret;
3215 }
3216 
3217 static int record__parse_off_cpu_thresh(const struct option *opt,
3218 					const char *str,
3219 					int unset __maybe_unused)
3220 {
3221 	struct record_opts *opts = opt->value;
3222 	char *endptr;
3223 	u64 off_cpu_thresh_ms;
3224 
3225 	if (!str)
3226 		return -EINVAL;
3227 
3228 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3229 
3230 	/* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */
3231 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3232 		return -EINVAL;
3233 	else
3234 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3235 
3236 	return 0;
3237 }
3238 
3239 static int parse_control_option(const struct option *opt,
3240 				const char *str,
3241 				int unset __maybe_unused)
3242 {
3243 	struct record_opts *opts = opt->value;
3244 
3245 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3246 }
3247 
3248 static void switch_output_size_warn(struct record *rec)
3249 {
3250 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3251 	struct switch_output *s = &rec->switch_output;
3252 
3253 	wakeup_size /= 2;
3254 
3255 	if (s->size < wakeup_size) {
3256 		char buf[100];
3257 
3258 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3259 		pr_warning("WARNING: switch-output data size lower than "
3260 			   "wakeup kernel buffer size (%s) "
3261 			   "expect bigger perf.data sizes\n", buf);
3262 	}
3263 }
3264 
/*
 * Interpret the --switch-output / --switch-output-event settings stored in
 * @rec and configure rec->switch_output accordingly.
 *
 * The --switch-output string is tried, in this order, as:
 *   - the literal "signal"            -> s->signal is set,
 *   - a size with B/K/M/G suffix       -> s->size is set,
 *   - a time with s/m/h/d suffix       -> s->time is set (in seconds).
 *
 * Whenever one of these modes is selected, timestamped output file names are
 * turned on and s->enabled is set.
 *
 * Returns 0 when switching is configured, when it was not requested, or when
 * it is ignored because parallel streaming (threads) mode is active; returns
 * -1 when the string matches none of the accepted forms.
 */
static int switch_output_setup(struct record *rec)
{
	struct switch_output *s = &rec->switch_output;
	/* size suffixes accepted by --switch-output, multipliers in bytes */
	static struct parse_tag tags_size[] = {
		{ .tag  = 'B', .mult = 1       },
		{ .tag  = 'K', .mult = 1 << 10 },
		{ .tag  = 'M', .mult = 1 << 20 },
		{ .tag  = 'G', .mult = 1 << 30 },
		{ .tag  = 0 },
	};
	/* time suffixes accepted by --switch-output, multipliers in seconds */
	static struct parse_tag tags_time[] = {
		{ .tag  = 's', .mult = 1        },
		{ .tag  = 'm', .mult = 60       },
		{ .tag  = 'h', .mult = 60*60    },
		{ .tag  = 'd', .mult = 60*60*24 },
		{ .tag  = 0 },
	};
	unsigned long val;

	/*
	 * If we're using --switch-output-events, then it implies
	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
	 * thread to its parent.
	 */
	if (rec->switch_output_event_set) {
		if (record__threads_enabled(rec)) {
			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
			return 0;
		}
		/* jumps into the "signal" branch below, skipping the s->str checks */
		goto do_signal;
	}

	/* --switch-output was not given at all: nothing to set up */
	if (!s->set)
		return 0;

	if (record__threads_enabled(rec)) {
		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
		return 0;
	}

	if (!strcmp(s->str, "signal")) {
do_signal:
		s->signal = true;
		pr_debug("switch-output with SIGUSR2 signal\n");
		goto enabled;
	}

	/* parse_tag_value() signals "no match" with (unsigned long)-1 */
	val = parse_tag_value(s->str, tags_size);
	if (val != (unsigned long) -1) {
		s->size = val;
		pr_debug("switch-output with %s size threshold\n", s->str);
		goto enabled;
	}

	val = parse_tag_value(s->str, tags_time);
	if (val != (unsigned long) -1) {
		s->time = val;
		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
			 s->str, s->time);
		goto enabled;
	}

	/* neither "signal", nor a size, nor a time */
	return -1;

enabled:
	rec->timestamp_filename = true;
	s->enabled              = true;

	/* the size warning is skipped when --no-buffering is in effect */
	if (s->size && !rec->opts.no_buffering)
		switch_output_size_warn(rec);

	return 0;
}
3338 
/* NULL-terminated list of usage lines for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the usage lines above. */
const char * const *record_usage = __record_usage;
3345 
3346 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3347 				  struct perf_sample *sample, struct machine *machine)
3348 {
3349 	/*
3350 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3351 	 * no need to add them twice.
3352 	 */
3353 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3354 		return 0;
3355 	return perf_event__process_mmap(tool, event, sample, machine);
3356 }
3357 
3358 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3359 				   struct perf_sample *sample, struct machine *machine)
3360 {
3361 	/*
3362 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3363 	 * no need to add them twice.
3364 	 */
3365 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3366 		return 0;
3367 
3368 	return perf_event__process_mmap2(tool, event, sample, machine);
3369 }
3370 
3371 static int process_timestamp_boundary(const struct perf_tool *tool,
3372 				      union perf_event *event __maybe_unused,
3373 				      struct perf_sample *sample,
3374 				      struct machine *machine __maybe_unused)
3375 {
3376 	struct record *rec = container_of(tool, struct record, tool);
3377 
3378 	set_timestamp_boundary(rec, sample->time);
3379 	return 0;
3380 }
3381 
3382 static int parse_record_synth_option(const struct option *opt,
3383 				     const char *str,
3384 				     int unset __maybe_unused)
3385 {
3386 	struct record_opts *opts = opt->value;
3387 	char *p = strdup(str);
3388 
3389 	if (p == NULL)
3390 		return -1;
3391 
3392 	opts->synth = parse_synth_opt(p);
3393 	free(p);
3394 
3395 	if (opts->synth < 0) {
3396 		pr_err("Invalid synth option: %s\n", str);
3397 		return -1;
3398 	}
3399 	return 0;
3400 }
3401 
/*
 * XXX Ideally would be local to cmd_record() and passed to a record__new
 * because we need to have access to it in record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 *
 * The initializer below provides the values in effect when an option is not
 * given on the command line; the option table in this file stores parsed
 * values straight into this object.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): UINT_MAX/ULLONG_MAX look like "not set by the user" markers - confirm */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
		.nr_threads_synthesize = 1,
		/* no control/ack descriptors until --control supplies them */
		.ctl_fd              = -1,
		.ctl_fd_ack          = -1,
		.synth               = PERF_SYNTH_ALL,
		.off_cpu_thresh_ns   = OFFCPU_THRESH,
	},
	.buildid_mmap = true,
};
3432 
/* Help text for --call-graph: the shared callchain help plus the default mode. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* Set by --dry-run ("Parse options then exit"). */
static bool dry_run;

/* Callback argument for -e/--event: events go to the main evlist. */
static struct parse_events_option_args parse_events_option_args = {
	.evlistp = &record.evlist,
};

/* Callback argument for --switch-output-event: events go to the side band evlist. */
static struct parse_events_option_args switch_output_parse_events_option_args = {
	.evlistp = &record.sb_evlist,
};
3445 
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 *
 * Command line option table of 'perf record'.  Each entry stores its value
 * directly into the global 'record' object (or another global such as
 * 'verbose'), or invokes the named callback with a pointer into it.  Entries
 * inside #ifdef blocks exist only when the corresponding feature is built in.
 */
static struct option __record_options[] = {
	/* event selection and filtering */
	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN(0, "latency", &record.latency,
		    "Enable data collection for latency profiling.\n"
		    "\t\t\t  Use perf report --latency for latency-centric profile."),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	/* target selection */
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	/* sampling rate and ring buffer sizing */
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		      record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	/* call graphs */
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	/* what to put into each sample */
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
		    "Record the sampled data address data page size"),
	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
		    "Record the sampled code address (ip) page size"),
	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
		    "Record the data source for memory operations"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
		    "Record the sample identifier"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	/* build-id handling */
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_CALLBACK('D', "delay", &record, "ms",
		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
		     record__parse_event_enable_time),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),

	/* branch stack sampling: -b and -j share the same callback */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers in user space,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	/* AUX area tracing */
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
		    "Record cgroup events"),
	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
			&record.opts.record_switch_events_set,
			"Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
			"Record build-id in mmap events and skip build-id processing."),
	/* output file naming and switching */
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			  "signal"),
	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
			 &record.switch_output_event_set, "switch output event",
			 "switch output event selector. use 'perf list' to list available events",
			 parse_events_option_new_evlist),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_UINTEGER(0, "num-thread-synthesize",
		     &record.opts.nr_threads_synthesize,
		     "number of threads to run for event synthesis"),
#ifdef HAVE_LIBPFM
	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
		"libpfm4 event selector. use 'perf list' to list available events",
		parse_libpfm_events_option),
#endif
	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
		      parse_control_option),
	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
			  &record.debuginfod.set, "debuginfod urls",
			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
			  "system"),
	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
			    "write collected trace data into several data files using parallel threads",
			    record__parse_threads),
	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
		   "BPF filter action"),
	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
		     record__parse_off_cpu_thresh),
	OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
			&record.opts.record_data_mmap_set,
			"Record mmap events for non-executable mappings"),
	OPT_END()
};

/* Non-static handle to the option table above. */
struct option *record_options = __record_options;
3666 
3667 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3668 {
3669 	struct perf_cpu cpu;
3670 	unsigned int idx;
3671 
3672 	if (cpu_map__is_dummy(cpus))
3673 		return 0;
3674 
3675 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3676 		/* Return ENODEV is input cpu is greater than max cpu */
3677 		if ((unsigned long)cpu.cpu > mask->nbits)
3678 			return -ENODEV;
3679 		__set_bit(cpu.cpu, mask->bits);
3680 	}
3681 
3682 	return 0;
3683 }
3684 
3685 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3686 {
3687 	struct perf_cpu_map *cpus;
3688 
3689 	cpus = perf_cpu_map__new(mask_spec);
3690 	if (!cpus)
3691 		return -ENOMEM;
3692 
3693 	bitmap_zero(mask->bits, mask->nbits);
3694 	if (record__mmap_cpu_mask_init(mask, cpus))
3695 		return -ENODEV;
3696 
3697 	perf_cpu_map__put(cpus);
3698 
3699 	return 0;
3700 }
3701 
3702 static void record__free_thread_masks(struct record *rec, int nr_threads)
3703 {
3704 	int t;
3705 
3706 	if (rec->thread_masks)
3707 		for (t = 0; t < nr_threads; t++)
3708 			record__thread_mask_free(&rec->thread_masks[t]);
3709 
3710 	zfree(&rec->thread_masks);
3711 }
3712 
3713 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3714 {
3715 	int t, ret;
3716 
3717 	rec->thread_masks = calloc(nr_threads, sizeof(*(rec->thread_masks)));
3718 	if (!rec->thread_masks) {
3719 		pr_err("Failed to allocate thread masks\n");
3720 		return -ENOMEM;
3721 	}
3722 
3723 	for (t = 0; t < nr_threads; t++) {
3724 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3725 		if (ret) {
3726 			pr_err("Failed to allocate thread masks[%d]\n", t);
3727 			goto out_free;
3728 		}
3729 	}
3730 
3731 	return 0;
3732 
3733 out_free:
3734 	record__free_thread_masks(rec, nr_threads);
3735 
3736 	return ret;
3737 }
3738 
3739 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3740 {
3741 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3742 
3743 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3744 	if (ret)
3745 		return ret;
3746 
3747 	rec->nr_threads = nr_cpus;
3748 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3749 
3750 	for (t = 0; t < rec->nr_threads; t++) {
3751 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3752 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3753 		if (verbose > 0) {
3754 			pr_debug("thread_masks[%d]: ", t);
3755 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3756 			pr_debug("thread_masks[%d]: ", t);
3757 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3758 		}
3759 	}
3760 
3761 	return 0;
3762 }
3763 
/*
 * Build rec->thread_masks from @nr_spec pairs of CPU list strings.
 *
 * @maps_spec[i] and @affinity_spec[i] describe the maps mask and the affinity
 * mask of thread i.  Each mask is restricted to the CPUs present in @cpus; a
 * mask that ends up empty, or that overlaps a mask of the same kind from an
 * earlier spec, is rejected with -EINVAL.
 *
 * On success rec->thread_masks holds one entry per spec and rec->nr_threads
 * is set to their number; zero specs yield -EINVAL.  Other errors are
 * -ENOMEM or whatever the mask helpers return.
 *
 * NOTE(review): on an error after some entries were already stored,
 * rec->nr_threads is not updated here - confirm that callers release those
 * entries' bitmaps.
 */
static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
					  const char **maps_spec, const char **affinity_spec,
					  u32 nr_spec)
{
	u32 s;
	int ret = 0, t = 0;
	/* cpus_mask: CPUs of @cpus; full_mask: union of all masks accepted so far */
	struct mmap_cpu_mask cpus_mask;
	struct thread_mask thread_mask, full_mask, *thread_masks;

	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate CPUs mask\n");
		return ret;
	}

	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
	if (ret) {
		pr_err("Failed to init cpu mask\n");
		goto out_free_cpu_mask;
	}

	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate full mask\n");
		goto out_free_cpu_mask;
	}

	/* scratch mask for the spec being processed */
	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
	if (ret) {
		pr_err("Failed to allocate thread mask\n");
		goto out_free_full_and_cpu_masks;
	}

	for (s = 0; s < nr_spec; s++) {
		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
		if (ret) {
			pr_err("Failed to initialize maps thread mask\n");
			goto out_free;
		}
		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
		if (ret) {
			pr_err("Failed to initialize affinity thread mask\n");
			goto out_free;
		}

		/* ignore invalid CPUs but do not allow empty masks */
		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
				cpus_mask.bits, thread_mask.maps.nbits)) {
			pr_err("Empty maps mask: %s\n", maps_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}
		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
				cpus_mask.bits, thread_mask.affinity.nbits)) {
			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}

		/* do not allow intersection with other masks (full_mask) */
		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
				      thread_mask.maps.nbits)) {
			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}
		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
				      thread_mask.affinity.nbits)) {
			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
			ret = -EINVAL;
			goto out_free;
		}

		/*
		 * Accumulate the accepted masks.  The affinity OR uses
		 * full_mask.maps.nbits; both bitmaps of full_mask were
		 * allocated with the same width above.
		 */
		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
			  thread_mask.maps.bits, full_mask.maps.nbits);
		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
			  thread_mask.affinity.bits, full_mask.maps.nbits);

		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
		if (!thread_masks) {
			pr_err("Failed to reallocate thread masks\n");
			ret = -ENOMEM;
			goto out_free;
		}
		rec->thread_masks = thread_masks;
		/* struct copy: the bitmaps of thread_mask now belong to rec->thread_masks[t] */
		rec->thread_masks[t] = thread_mask;
		if (verbose > 0) {
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
			pr_debug("thread_masks[%d]: ", t);
			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
		}
		t++;
		/*
		 * Get fresh bitmaps for the next spec.  On failure thread_mask
		 * owns nothing, hence the jump past the out_free label.
		 */
		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
		if (ret) {
			pr_err("Failed to allocate thread mask\n");
			goto out_free_full_and_cpu_masks;
		}
	}
	rec->nr_threads = t;
	pr_debug("nr_threads: %d\n", rec->nr_threads);
	if (!rec->nr_threads)
		ret = -EINVAL;

	/* success falls through: the scratch and helper masks are always released */
out_free:
	record__thread_mask_free(&thread_mask);
out_free_full_and_cpu_masks:
	record__thread_mask_free(&full_mask);
out_free_cpu_mask:
	record__mmap_cpu_mask_free(&cpus_mask);

	return ret;
}
3877 
3878 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3879 {
3880 	int ret;
3881 	struct cpu_topology *topo;
3882 
3883 	topo = cpu_topology__new();
3884 	if (!topo) {
3885 		pr_err("Failed to allocate CPU topology\n");
3886 		return -ENOMEM;
3887 	}
3888 
3889 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3890 					     topo->core_cpus_list, topo->core_cpus_lists);
3891 	cpu_topology__delete(topo);
3892 
3893 	return ret;
3894 }
3895 
3896 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3897 {
3898 	int ret;
3899 	struct cpu_topology *topo;
3900 
3901 	topo = cpu_topology__new();
3902 	if (!topo) {
3903 		pr_err("Failed to allocate CPU topology\n");
3904 		return -ENOMEM;
3905 	}
3906 
3907 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3908 					     topo->package_cpus_list, topo->package_cpus_lists);
3909 	cpu_topology__delete(topo);
3910 
3911 	return ret;
3912 }
3913 
3914 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3915 {
3916 	u32 s;
3917 	int ret;
3918 	const char **spec;
3919 	struct numa_topology *topo;
3920 
3921 	topo = numa_topology__new();
3922 	if (!topo) {
3923 		pr_err("Failed to allocate NUMA topology\n");
3924 		return -ENOMEM;
3925 	}
3926 
3927 	spec = calloc(topo->nr, sizeof(char *));
3928 	if (!spec) {
3929 		pr_err("Failed to allocate NUMA spec\n");
3930 		ret = -ENOMEM;
3931 		goto out_delete_topo;
3932 	}
3933 	for (s = 0; s < topo->nr; s++)
3934 		spec[s] = topo->nodes[s].cpus;
3935 
3936 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3937 
3938 	zfree(&spec);
3939 
3940 out_delete_topo:
3941 	numa_topology__delete(topo);
3942 
3943 	return ret;
3944 }
3945 
/*
 * Thread layout taken from the user supplied --threads specification stored
 * in rec->opts.threads_user_spec.
 *
 * The string is split at ':' into per-thread specs and each of those at '/'
 * into a maps CPU list followed by an affinity CPU list.  The pieces are
 * duplicated into two parallel arrays which are then handed to
 * record__init_thread_masks_spec().  Note that strtok_r() modifies
 * rec->opts.threads_user_spec in place.
 *
 * Returns the result of record__init_thread_masks_spec(), -ENOMEM on
 * allocation failure, or -EINVAL when a spec has no affinity part.
 */
static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
{
	int t, ret;
	u32 s, nr_spec = 0;
	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
	/*
	 * dup_mask tracks a maps string that was duplicated but is not yet
	 * counted in nr_spec, so that the cleanup below can still free it.
	 */
	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;

	/* strtok_r() takes the string on the first call only, NULL afterwards */
	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
		spec = strtok_r(user_spec, ":", &spec_ptr);
		if (spec == NULL)
			break;
		pr_debug2("threads_spec[%d]: %s\n", t, spec);
		mask = strtok_r(spec, "/", &mask_ptr);
		/* a spec without any maps token ends the parsing without an error */
		if (mask == NULL)
			break;
		pr_debug2("  maps mask: %s\n", mask);
		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate maps spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		maps_spec = tmp_spec;
		maps_spec[nr_spec] = dup_mask = strdup(mask);
		if (!maps_spec[nr_spec]) {
			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		/* second '/'-separated token: the affinity CPU list (mandatory) */
		mask = strtok_r(NULL, "/", &mask_ptr);
		if (mask == NULL) {
			pr_err("Invalid thread maps or affinity specs\n");
			ret = -EINVAL;
			goto out_free;
		}
		pr_debug2("  affinity mask: %s\n", mask);
		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
		if (!tmp_spec) {
			pr_err("Failed to reallocate affinity spec\n");
			ret = -ENOMEM;
			goto out_free;
		}
		affinity_spec = tmp_spec;
		affinity_spec[nr_spec] = strdup(mask);
		if (!affinity_spec[nr_spec]) {
			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
			ret = -ENOMEM;
			goto out_free;
		}
		/* the pair is complete: from now on the cleanup loop covers it */
		dup_mask = NULL;
		nr_spec++;
	}

	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
					     (const char **)affinity_spec, nr_spec);

out_free:
	/* frees the half-finished maps string of a failed iteration, if any */
	free(dup_mask);
	for (s = 0; s < nr_spec; s++) {
		if (maps_spec)
			free(maps_spec[s]);
		if (affinity_spec)
			free(affinity_spec[s]);
	}
	free(affinity_spec);
	free(maps_spec);

	return ret;
}
4015 
4016 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4017 {
4018 	int ret;
4019 
4020 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4021 	if (ret)
4022 		return ret;
4023 
4024 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4025 		return -ENODEV;
4026 
4027 	rec->nr_threads = 1;
4028 
4029 	return 0;
4030 }
4031 
4032 static int record__init_thread_masks(struct record *rec)
4033 {
4034 	int ret = 0;
4035 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4036 
4037 	if (!record__threads_enabled(rec))
4038 		return record__init_thread_default_masks(rec, cpus);
4039 
4040 	if (evlist__per_thread(rec->evlist)) {
4041 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4042 		return -EINVAL;
4043 	}
4044 
4045 	switch (rec->opts.threads_spec) {
4046 	case THREAD_SPEC__CPU:
4047 		ret = record__init_thread_cpu_masks(rec, cpus);
4048 		break;
4049 	case THREAD_SPEC__CORE:
4050 		ret = record__init_thread_core_masks(rec, cpus);
4051 		break;
4052 	case THREAD_SPEC__PACKAGE:
4053 		ret = record__init_thread_package_masks(rec, cpus);
4054 		break;
4055 	case THREAD_SPEC__NUMA:
4056 		ret = record__init_thread_numa_masks(rec, cpus);
4057 		break;
4058 	case THREAD_SPEC__USER:
4059 		ret = record__init_thread_user_masks(rec, cpus);
4060 		break;
4061 	default:
4062 		break;
4063 	}
4064 
4065 	return ret;
4066 }
4067 
4068 int cmd_record(int argc, const char **argv)
4069 {
4070 	int err;
4071 	struct record *rec = &record;
4072 	char errbuf[BUFSIZ];
4073 
4074 	setlocale(LC_ALL, "");
4075 
4076 #ifndef HAVE_BPF_SKEL
4077 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4078 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4079 # undef set_nobuild
4080 #endif
4081 
4082 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
4083 	symbol_conf.lazy_load_kernel_maps = true;
4084 	rec->opts.affinity = PERF_AFFINITY_SYS;
4085 
4086 	rec->evlist = evlist__new();
4087 	if (rec->evlist == NULL)
4088 		return -ENOMEM;
4089 
4090 	err = perf_config(perf_record_config, rec);
4091 	if (err)
4092 		return err;
4093 
4094 	argc = parse_options(argc, argv, record_options, record_usage,
4095 			    PARSE_OPT_STOP_AT_NON_OPTION);
4096 	if (quiet)
4097 		perf_quiet_option();
4098 
4099 	err = symbol__validate_sym_arguments();
4100 	if (err)
4101 		return err;
4102 
4103 	perf_debuginfod_setup(&record.debuginfod);
4104 
4105 	/*
4106 	 * Use system wide (-a) for the default target (ie. when no
4107 	 * workload). User ID filtering also implies system-wide.
4108 	 */
4109 	if ((!argc && target__none(&rec->opts.target)) || rec->uid_str)
4110 		rec->opts.target.system_wide = true;
4111 
4112 	if (nr_cgroups && !rec->opts.target.system_wide) {
4113 		usage_with_options_msg(record_usage, record_options,
4114 			"cgroup monitoring only available in system-wide mode");
4115 
4116 	}
4117 
4118 	if (record.latency) {
4119 		/*
4120 		 * There is no fundamental reason why latency profiling
4121 		 * can't work for system-wide mode, but exact semantics
4122 		 * and details are to be defined.
4123 		 * See the following thread for details:
4124 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4125 		 */
4126 		if (record.opts.target.system_wide) {
4127 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4128 			err = -EINVAL;
4129 			goto out_opts;
4130 		}
4131 		record.opts.record_switch_events = true;
4132 	}
4133 
4134 	if (rec->buildid_mmap && !perf_can_record_build_id()) {
4135 		pr_warning("Missing support for build id in kernel mmap events.\n"
4136 			   "Disable this warning with --no-buildid-mmap\n");
4137 		rec->buildid_mmap = false;
4138 	}
4139 
4140 	if (rec->buildid_mmap) {
4141 		/* Enable perf_event_attr::build_id bit. */
4142 		rec->opts.build_id = true;
4143 		/* Disable build-ID table in the header. */
4144 		rec->no_buildid = true;
4145 	} else {
4146 		pr_debug("Disabling build id in synthesized mmap2 events.\n");
4147 		symbol_conf.no_buildid_mmap2 = true;
4148 	}
4149 
4150 	if (rec->no_buildid_set && rec->no_buildid) {
4151 		/* -B implies -N for historic reasons. */
4152 		rec->no_buildid_cache = true;
4153 	}
4154 
4155 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4156 		pr_err("Kernel has no cgroup sampling support.\n");
4157 		err = -EINVAL;
4158 		goto out_opts;
4159 	}
4160 
4161 	if (rec->opts.kcore)
4162 		rec->opts.text_poke = true;
4163 
4164 	if (rec->opts.kcore || record__threads_enabled(rec))
4165 		rec->data.is_dir = true;
4166 
4167 	if (record__threads_enabled(rec)) {
4168 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4169 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4170 			goto out_opts;
4171 		}
4172 		if (record__aio_enabled(rec)) {
4173 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4174 			goto out_opts;
4175 		}
4176 	}
4177 
4178 	if (rec->opts.comp_level != 0) {
4179 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4180 		rec->no_buildid = true;
4181 	}
4182 
4183 	if (rec->opts.record_switch_events &&
4184 	    !perf_can_record_switch_events()) {
4185 		ui__error("kernel does not support recording context switch events\n");
4186 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4187 		err = -EINVAL;
4188 		goto out_opts;
4189 	}
4190 
4191 	if (switch_output_setup(rec)) {
4192 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4193 		err = -EINVAL;
4194 		goto out_opts;
4195 	}
4196 
4197 	if (rec->switch_output.time) {
4198 		signal(SIGALRM, alarm_sig_handler);
4199 		alarm(rec->switch_output.time);
4200 	}
4201 
4202 	if (rec->switch_output.num_files) {
4203 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4204 						      sizeof(char *));
4205 		if (!rec->switch_output.filenames) {
4206 			err = -EINVAL;
4207 			goto out_opts;
4208 		}
4209 	}
4210 
4211 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4212 		rec->timestamp_filename = false;
4213 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4214 	}
4215 
4216 	if (rec->filter_action) {
4217 		if (!strcmp(rec->filter_action, "pin"))
4218 			err = perf_bpf_filter__pin();
4219 		else if (!strcmp(rec->filter_action, "unpin"))
4220 			err = perf_bpf_filter__unpin();
4221 		else {
4222 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4223 			err = -EINVAL;
4224 		}
4225 		goto out_opts;
4226 	}
4227 
4228 	/* For backward compatibility, -d implies --mem-info and --data-mmap */
4229 	if (rec->opts.sample_address) {
4230 		rec->opts.sample_data_src = true;
4231 		if (!rec->opts.record_data_mmap_set)
4232 			rec->opts.record_data_mmap = true;
4233 	}
4234 
4235 	/*
4236 	 * Allow aliases to facilitate the lookup of symbols for address
4237 	 * filters. Refer to auxtrace_parse_filters().
4238 	 */
4239 	symbol_conf.allow_aliases = true;
4240 
4241 	symbol__init(NULL);
4242 
4243 	err = record__auxtrace_init(rec);
4244 	if (err)
4245 		goto out;
4246 
4247 	if (dry_run)
4248 		goto out;
4249 
4250 	err = -ENOMEM;
4251 
4252 	if (rec->no_buildid_cache) {
4253 		disable_buildid_cache();
4254 	} else if (rec->switch_output.enabled) {
4255 		/*
4256 		 * In 'perf record --switch-output', disable buildid
4257 		 * generation by default to reduce data file switching
4258 		 * overhead. Still generate buildid if they are required
4259 		 * explicitly using
4260 		 *
4261 		 *  perf record --switch-output --no-no-buildid \
4262 		 *              --no-no-buildid-cache
4263 		 *
4264 		 * Following code equals to:
4265 		 *
4266 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4267 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4268 		 *         disable_buildid_cache();
4269 		 */
4270 		bool disable = true;
4271 
4272 		if (rec->no_buildid_set && !rec->no_buildid)
4273 			disable = false;
4274 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4275 			disable = false;
4276 		if (disable) {
4277 			rec->no_buildid = true;
4278 			rec->no_buildid_cache = true;
4279 			disable_buildid_cache();
4280 		}
4281 	}
4282 
4283 	if (record.opts.overwrite)
4284 		record.opts.tail_synthesize = true;
4285 
4286 	if (rec->evlist->core.nr_entries == 0) {
4287 		struct evlist *def_evlist = evlist__new_default(&rec->opts.target,
4288 								callchain_param.enabled);
4289 
4290 		if (!def_evlist)
4291 			goto out;
4292 
4293 		evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
4294 		evlist__delete(def_evlist);
4295 	}
4296 
4297 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4298 		rec->opts.no_inherit = true;
4299 
4300 	err = target__validate(&rec->opts.target);
4301 	if (err) {
4302 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4303 		ui__warning("%s\n", errbuf);
4304 	}
4305 
4306 	if (rec->uid_str) {
4307 		uid_t uid = parse_uid(rec->uid_str);
4308 
4309 		if (uid == UINT_MAX) {
4310 			ui__error("Invalid User: %s", rec->uid_str);
4311 			err = -EINVAL;
4312 			goto out;
4313 		}
4314 		err = parse_uid_filter(rec->evlist, uid);
4315 		if (err)
4316 			goto out;
4317 	}
4318 
4319 	/* Enable ignoring missing threads when -p option is defined. */
4320 	rec->opts.ignore_missing_thread = rec->opts.target.pid;
4321 
4322 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4323 
4324 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP) {
4325 		if (EM_HOST == EM_AARCH64)
4326 			add_leaf_frame_caller_opts_aarch64(&rec->opts);
4327 	}
4328 
4329 	err = -ENOMEM;
4330 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4331 		if (rec->opts.target.pid != NULL) {
4332 			pr_err("Couldn't create thread/CPU maps: %s\n",
4333 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4334 			goto out;
4335 		}
4336 		else
4337 			usage_with_options(record_usage, record_options);
4338 	}
4339 
4340 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4341 	if (err)
4342 		goto out;
4343 
4344 	/*
4345 	 * We take all buildids when the file contains
4346 	 * AUX area tracing data because we do not decode the
4347 	 * trace because it would take too long.
4348 	 */
4349 	if (rec->opts.full_auxtrace)
4350 		rec->buildid_all = true;
4351 
4352 	if (rec->opts.text_poke) {
4353 		err = record__config_text_poke(rec->evlist);
4354 		if (err) {
4355 			pr_err("record__config_text_poke failed, error %d\n", err);
4356 			goto out;
4357 		}
4358 	}
4359 
4360 	if (rec->off_cpu) {
4361 		err = record__config_off_cpu(rec);
4362 		if (err) {
4363 			pr_err("record__config_off_cpu failed, error %d\n", err);
4364 			goto out;
4365 		}
4366 	}
4367 
4368 	if (record_opts__config(&rec->opts)) {
4369 		err = -EINVAL;
4370 		goto out;
4371 	}
4372 
4373 	err = record__config_tracking_events(rec);
4374 	if (err) {
4375 		pr_err("record__config_tracking_events failed, error %d\n", err);
4376 		goto out;
4377 	}
4378 
4379 	err = record__init_thread_masks(rec);
4380 	if (err) {
4381 		pr_err("Failed to initialize parallel data streaming masks\n");
4382 		goto out;
4383 	}
4384 
4385 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4386 		rec->opts.nr_cblocks = nr_cblocks_max;
4387 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4388 
4389 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4390 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4391 
4392 	if (rec->opts.comp_level > comp_level_max)
4393 		rec->opts.comp_level = comp_level_max;
4394 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4395 
4396 	err = __cmd_record(&record, argc, argv);
4397 out:
4398 	record__free_thread_masks(rec, rec->nr_threads);
4399 	rec->nr_threads = 0;
4400 	symbol__exit();
4401 	auxtrace_record__free(rec->itr);
4402 out_opts:
4403 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4404 	evlist__delete(rec->evlist);
4405 	return err;
4406 }
4407 
4408 static void snapshot_sig_handler(int sig __maybe_unused)
4409 {
4410 	struct record *rec = &record;
4411 
4412 	hit_auxtrace_snapshot_trigger(rec);
4413 
4414 	if (switch_output_signal(rec))
4415 		trigger_hit(&switch_output_trigger);
4416 }
4417 
4418 static void alarm_sig_handler(int sig __maybe_unused)
4419 {
4420 	struct record *rec = &record;
4421 
4422 	if (switch_output_time(rec))
4423 		trigger_hit(&switch_output_trigger);
4424 }
4425