xref: /linux/tools/perf/builtin-record.c (revision 80b549be27de0f11124c66eaeb5307c7b4582edd)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "util/strbuf.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 #include "dwarf-regs.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
/*
 * State of the switch-output feature. The signal, size and time members are
 * tested by switch_output_signal(), switch_output_size() and
 * switch_output_time() respectively.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* tested by switch_output_signal() */
	unsigned long	 size;		/* byte threshold compared with rec->bytes_written; 0 = off */
	unsigned long	 time;		/* non-zero enables switch_output_time() */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
97 
/* Pair of CPU masks a record_thread refers to through its 'mask' pointer. */
struct thread_mask {
	struct mmap_cpu_mask	maps;		/* bits select the evlist mmaps handed to the thread */
	struct mmap_cpu_mask	affinity;
};
102 
/*
 * Per-thread recording state. record__write() and record__pushfn() update the
 * instance of the calling thread through the __thread pointer 'thread'.
 */
struct record_thread {
	pid_t			tid;
	struct thread_mask	*mask;		/* mask->maps selects the mmaps of this thread */
	struct {
		int		msg[2];		/* pipe(2) descriptor pair, -1 while closed */
		int		ack[2];		/* pipe(2) descriptor pair, -1 while closed */
	} pipes;
	struct fdarray		pollfd;		/* entries duplicated from evlist->core.pollfd */
	int			ctlfd_pos;
	int			nr_mmaps;	/* number of slots in maps[] and overwrite_maps[] */
	struct mmap		**maps;		/* pointers into evlist->mmap[], NULL if unused */
	struct mmap		**overwrite_maps; /* pointers into evlist->overwrite_mmap[], NULL if unused */
	struct record		*rec;
	unsigned long long	samples;	/* incremented by record__pushfn() */
	unsigned long		waking;
	u64			bytes_written;	/* bytes this thread wrote to per-map files */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
122 
/* Thread-local pointer to the record_thread state of the calling thread. */
static __thread struct record_thread *thread;
124 
/* Message identifiers; THREAD_MSG__MAX sizes thread_msg_tags[]. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};
130 
/* Printable names indexed by enum thread_msg. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	"UNDEFINED", "READY"
};
134 
/* Thread specification kinds; THREAD_SPEC__MAX sizes thread_spec_tags[]. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};
144 
/* Printable names indexed by enum thread_spec. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	"undefined", "cpu", "core", "package", "numa", "user"
};
148 
/*
 * One association between an evlist pollfd index and a thread pollfd index,
 * appended by record__map_thread_evlist_pollfd_indexes().
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
153 
/*
 * Top-level state of a record session. Tool callbacks recover it from the
 * embedded 'tool' member with container_of().
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;		/* bytes written to the session file (incl. queued aio) */
	u64			thread_bytes_written;	/* bytes written to per-map files */
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			latency;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			buildid_mmap_set;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	const char		*filter_action;
	const char		*uid_str;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;		/* number of entries in thread_data[] */
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;		/* grown on demand, see index_map_sz */
	size_t			index_map_sz;
	size_t			index_map_cnt;		/* entries of index_map[] in use */
};
191 
/* Set to 1 by sig_handler() and by record__write() once the size limit is hit. */
static volatile int done;

/* When non-zero, record__auxtrace_snapshot_exit() does not start a new snapshot. */
static volatile int auxtrace_record__snapshot_started;
/* Driven by record__read_auxtrace_snapshot(): error on failure, ready on success. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
/* Hit when switch_output_size() reports that the size threshold was reached. */
static DEFINE_TRIGGER(switch_output_trigger);
197 
/* Printable affinity mode names; the array holds PERF_AFFINITY_MAX entries. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
201 
/*
 * Forward declarations of tool callbacks whose definitions are not part of
 * this section of the file.
 */
static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine);
static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine);
static int process_timestamp_boundary(const struct perf_tool *tool,
				      union perf_event *event,
				      struct perf_sample *sample,
				      struct machine *machine);
210 
#ifndef HAVE_GETTID
/*
 * Fallback used when HAVE_GETTID is not defined: obtain the thread id with a
 * raw __NR_gettid system call.
 */
static inline pid_t gettid(void)
{
	return (pid_t)syscall(__NR_gettid);
}
#endif
217 
218 static int record__threads_enabled(struct record *rec)
219 {
220 	return rec->opts.threads_spec;
221 }
222 
223 static bool switch_output_signal(struct record *rec)
224 {
225 	return rec->switch_output.signal &&
226 	       trigger_is_ready(&switch_output_trigger);
227 }
228 
229 static bool switch_output_size(struct record *rec)
230 {
231 	return rec->switch_output.size &&
232 	       trigger_is_ready(&switch_output_trigger) &&
233 	       (rec->bytes_written >= rec->switch_output.size);
234 }
235 
236 static bool switch_output_time(struct record *rec)
237 {
238 	return rec->switch_output.time &&
239 	       trigger_is_ready(&switch_output_trigger);
240 }
241 
242 static u64 record__bytes_written(struct record *rec)
243 {
244 	return rec->bytes_written + rec->thread_bytes_written;
245 }
246 
247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 	return rec->output_max_size &&
250 	       (record__bytes_written(rec) >= rec->output_max_size);
251 }
252 
253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 			 void *bf, size_t size)
255 {
256 	struct perf_data_file *file = &rec->session->data->file;
257 
258 	if (map && map->file)
259 		file = map->file;
260 
261 	if (perf_data_file__write(file, bf, size) < 0) {
262 		pr_err("failed to write perf data, error: %m\n");
263 		return -1;
264 	}
265 
266 	if (map && map->file) {
267 		thread->bytes_written += size;
268 		rec->thread_bytes_written += size;
269 	} else {
270 		rec->bytes_written += size;
271 	}
272 
273 	if (record__output_max_size_exceeded(rec) && !done) {
274 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 				" stopping session ]\n",
276 				record__bytes_written(rec) >> 10);
277 		done = 1;
278 	}
279 
280 	if (switch_output_size(rec))
281 		trigger_hit(&switch_output_trigger);
282 
283 	return 0;
284 }
285 
/* Forward declarations needed by the aio code below. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
290 
291 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock for writing @size bytes from @buf to @trace_fd at offset
 * @off and queue it with aio_write(), retrying for as long as the call fails
 * with EAGAIN.
 *
 * Returns 0 once the request is queued. On any other failure the control
 * block is marked free (aio_fildes = -1) and the aio_write() result is
 * returned.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			return rc;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		return rc;
	}
}
316 
317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 	void *rem_buf;
320 	off_t rem_off;
321 	size_t rem_size;
322 	int rc, aio_errno;
323 	ssize_t aio_ret, written;
324 
325 	aio_errno = aio_error(cblock);
326 	if (aio_errno == EINPROGRESS)
327 		return 0;
328 
329 	written = aio_ret = aio_return(cblock);
330 	if (aio_ret < 0) {
331 		if (aio_errno != EINTR)
332 			pr_err("failed to write perf data, error: %m\n");
333 		written = 0;
334 	}
335 
336 	rem_size = cblock->aio_nbytes - written;
337 
338 	if (rem_size == 0) {
339 		cblock->aio_fildes = -1;
340 		/*
341 		 * md->refcount is incremented in record__aio_pushfn() for
342 		 * every aio write request started in record__aio_push() so
343 		 * decrement it because the request is now complete.
344 		 */
345 		perf_mmap__put(&md->core);
346 		rc = 1;
347 	} else {
348 		/*
349 		 * aio write request may require restart with the
350 		 * remainder if the kernel didn't write whole
351 		 * chunk at once.
352 		 */
353 		rem_off = cblock->aio_offset + written;
354 		rem_buf = (void *)(cblock->aio_buf + written);
355 		record__aio_write(cblock, cblock->aio_fildes,
356 				rem_buf, rem_size, rem_off);
357 		rc = 0;
358 	}
359 
360 	return rc;
361 }
362 
/*
 * Poll the aio control blocks of @md.
 *
 * With @sync_all false: return the index of the first control block that is
 * free (aio_fildes == -1) or whose request record__aio_complete() reports as
 * finished, sleeping in aio_suspend() until one is available.
 * With @sync_all true: wait until no request is pending any more.
 *
 * Returns -1 whenever a pass over the control blocks finds nothing to wait
 * for.
 */
static int record__aio_sync(struct mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				/* Slot is free: drop it from the suspend list or hand it out. */
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		/* Retry on any aio_suspend() failure; only unexpected errors are logged. */
		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
397 
/* Context passed from record__aio_push() to record__aio_pushfn(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer, one of map->aio.data[] */
	size_t		size;	/* bytes stored in data so far */
};
403 
/*
 * Push callback for the aio path: append @size bytes from @buf (compressed
 * when compression is enabled) to the aio buffer described by @to, which is a
 * struct record_aio.
 *
 * Returns the number of bytes appended, or the negative zstd_compress()
 * result on compression failure.
 */
static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
	 * to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from perf_mmap__push() function.
	 *
	 * That lets the kernel proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
						   mmap__mmap_len(map) - aio->size,
						   buf, size);
		if (compressed < 0)
			return (int)compressed;

		size = compressed;
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(&map->core);
	}

	aio->size += size;

	return size;
}
452 
/*
 * Move the pending data of @map into a free aio buffer and queue an
 * asynchronous write of it at file offset *@off.
 *
 * Returns the perf_mmap__push() result when it is non-zero (> 0: no data,
 * < 0: error), otherwise the record__aio_write() result. On success *@off and
 * rec->bytes_written are advanced by the number of bytes queued.
 */
static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = perf_data__fd(rec->session->data);
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after previous aio write operation.
	 */

	/*
	 * NOTE(review): record__aio_sync() returns -1 when there are no control
	 * blocks at all; presumably callers only get here with nr_cblocks > 0 -
	 * confirm.
	 */
	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	/*
	 * NOTE(review): if the push fails after record__aio_pushfn() already
	 * took the map reference (aio.size != 0), nothing on this path drops
	 * it - verify against perf_mmap__push().
	 */
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(&map->core);
	}

	return ret;
}
489 
/* Return the current file offset of @trace_fd as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	const off_t cur = lseek(trace_fd, 0, SEEK_CUR);

	return cur;
}
494 
/* Move the file offset of @trace_fd to @pos; the lseek() result is ignored. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
499 
500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 	int i;
503 	struct evlist *evlist = rec->evlist;
504 	struct mmap *maps = evlist->mmap;
505 
506 	if (!record__aio_enabled(rec))
507 		return;
508 
509 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 		struct mmap *map = &maps[i];
511 
512 		if (map->core.base)
513 			record__aio_sync(map, true);
514 	}
515 }
516 
/* Number of aio control blocks used when the aio option carries no (or a zero) value. */
static int nr_cblocks_default = 1;
/* Upper bound for the number of aio control blocks when aio is supported. */
static int nr_cblocks_max = 4;
519 
520 static int record__aio_parse(const struct option *opt,
521 			     const char *str,
522 			     int unset)
523 {
524 	struct record_opts *opts = (struct record_opts *)opt->value;
525 
526 	if (unset) {
527 		opts->nr_cblocks = 0;
528 	} else {
529 		if (str)
530 			opts->nr_cblocks = strtol(str, NULL, 0);
531 		if (!opts->nr_cblocks)
532 			opts->nr_cblocks = nr_cblocks_default;
533 	}
534 
535 	return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
/* Without aio support no control blocks can be used. */
static int nr_cblocks_max = 0;

/* Stub for builds without aio support: always fails with -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

/* Stub for builds without aio support: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Stub for builds without aio support: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Stub for builds without aio support: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
558 #endif
559 
560 static int record__aio_enabled(struct record *rec)
561 {
562 	return rec->opts.nr_cblocks > 0;
563 }
564 
565 #define MMAP_FLUSH_DEFAULT 1
566 static int record__mmap_flush_parse(const struct option *opt,
567 				    const char *str,
568 				    int unset)
569 {
570 	int flush_max;
571 	struct record_opts *opts = (struct record_opts *)opt->value;
572 	static struct parse_tag tags[] = {
573 			{ .tag  = 'B', .mult = 1       },
574 			{ .tag  = 'K', .mult = 1 << 10 },
575 			{ .tag  = 'M', .mult = 1 << 20 },
576 			{ .tag  = 'G', .mult = 1 << 30 },
577 			{ .tag  = 0 },
578 	};
579 
580 	if (unset)
581 		return 0;
582 
583 	if (str) {
584 		opts->mmap_flush = parse_tag_value(str, tags);
585 		if (opts->mmap_flush == (int)-1)
586 			opts->mmap_flush = strtol(str, NULL, 0);
587 	}
588 
589 	if (!opts->mmap_flush)
590 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591 
592 	flush_max = evlist__mmap_size(opts->mmap_pages);
593 	flush_max /= 4;
594 	if (opts->mmap_flush > flush_max)
595 		opts->mmap_flush = flush_max;
596 
597 	return 0;
598 }
599 
600 #ifdef HAVE_ZSTD_SUPPORT
/* Level applied by record__parse_comp_level() when no (or a zero) value is given. */
static unsigned int comp_level_default = 1;
602 
603 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
604 {
605 	struct record_opts *opts = opt->value;
606 
607 	if (unset) {
608 		opts->comp_level = 0;
609 	} else {
610 		if (str)
611 			opts->comp_level = strtol(str, NULL, 0);
612 		if (!opts->comp_level)
613 			opts->comp_level = comp_level_default;
614 	}
615 
616 	return 0;
617 }
618 #endif
/* Highest compression level; defined whether or not HAVE_ZSTD_SUPPORT is set. */
static unsigned int comp_level_max = 22;
620 
621 static int record__comp_enabled(struct record *rec)
622 {
623 	return rec->opts.comp_level > 0;
624 }
625 
626 static int process_synthesized_event(const struct perf_tool *tool,
627 				     union perf_event *event,
628 				     struct perf_sample *sample __maybe_unused,
629 				     struct machine *machine __maybe_unused)
630 {
631 	struct record *rec = container_of(tool, struct record, tool);
632 	return record__write(rec, NULL, event, event->header.size);
633 }
634 
/* Serializes process_synthesized_event() calls made via process_locked_synthesized_event(). */
static struct mutex synth_lock;
636 
637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 				     union perf_event *event,
639 				     struct perf_sample *sample __maybe_unused,
640 				     struct machine *machine __maybe_unused)
641 {
642 	int ret;
643 
644 	mutex_lock(&synth_lock);
645 	ret = process_synthesized_event(tool, event, sample, machine);
646 	mutex_unlock(&synth_lock);
647 	return ret;
648 }
649 
/*
 * Push callback for the synchronous path: write @size bytes from @bf for
 * @map; @to is the struct record.
 *
 * With compression enabled the data is first compressed into map->data, the
 * resulting record is padded to a multiple of 8 bytes and both the record and
 * the padding are written. Returns the negative zstd_compress() result on
 * compression failure, otherwise 0 on success and non-zero if a write failed.
 */
static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
{
	struct record *rec = to;

	if (record__comp_enabled(rec)) {
		struct perf_record_compressed2 *event = map->data;
		size_t padding = 0;
		u8 pad[8] = {0};
		ssize_t compressed = zstd_compress(rec->session, map, map->data,
						   mmap__mmap_len(map), bf, size);

		if (compressed < 0)
			return (int)compressed;

		/* From here on the compressed record in map->data is what gets written. */
		bf = event;
		thread->samples++;

		/*
		 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
		 * error. We make it aligned here.
		 */
		event->data_size = compressed - sizeof(struct perf_record_compressed2);
		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
		padding = event->header.size - compressed;
		/* The padding is written only if the record itself was written. */
		return record__write(rec, map, bf, compressed) ||
		       record__write(rec, map, &pad, padding);
	}

	thread->samples++;
	return record__write(rec, map, bf, size);
}
681 
/* Number of the last non-SIGCHLD signal seen by sig_handler(), -1 if none. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* Descriptor sig_handler() writes to in order to wake up poll(); -1 while unset. */
static volatile sig_atomic_t done_fd = -1;
#endif
687 
/*
 * Signal handler: note which signal arrived (SIGCHLD sets child_finished,
 * anything else is stored in signr), raise 'done' and, when eventfd support
 * is built in and done_fd is valid, write to done_fd. errno is preserved
 * across that write.
 */
static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;
	else
		signr = sig;

	done = 1;
#ifdef HAVE_EVENTFD_SUPPORT
	if (done_fd >= 0) {
		u64 tmp = 1;
		int orig_errno = errno;

		/*
		 * It is possible for this signal handler to run after done is
		 * checked in the main loop, but before the perf counter fds are
		 * polled. If this happens, the poll() will continue to wait
		 * even though done is set, and will only break out if either
		 * another signal is received, or the counters are ready for
		 * read. To ensure the poll() doesn't sleep when done is set,
		 * use an eventfd (done_fd) to wake up the poll().
		 */
		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
			pr_err("failed to signal wakeup fd, error: %m\n");

		errno = orig_errno;
	}
#endif // HAVE_EVENTFD_SUPPORT
}
717 
/*
 * Handler that calls perf_hooks__recover() and then passes @sig on to
 * sighandler_dump_stack().
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
723 
724 static void record__sig_exit(void)
725 {
726 	if (signr == -1)
727 		return;
728 
729 	signal(signr, SIG_DFL);
730 	raise(signr);
731 }
732 
733 static int record__process_auxtrace(const struct perf_tool *tool,
734 				    struct mmap *map,
735 				    union perf_event *event, void *data1,
736 				    size_t len1, void *data2, size_t len2)
737 {
738 	struct record *rec = container_of(tool, struct record, tool);
739 	struct perf_data *data = &rec->data;
740 	size_t padding;
741 	u8 pad[8] = {0};
742 
743 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744 		off_t file_offset;
745 		int fd = perf_data__fd(data);
746 		int err;
747 
748 		file_offset = lseek(fd, 0, SEEK_CUR);
749 		if (file_offset == -1)
750 			return -1;
751 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752 						     event, file_offset);
753 		if (err)
754 			return err;
755 	}
756 
757 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758 	padding = (len1 + len2) & 7;
759 	if (padding)
760 		padding = 8 - padding;
761 
762 	record__write(rec, map, event, event->header.size);
763 	record__write(rec, map, data1, len1);
764 	if (len2)
765 		record__write(rec, map, data2, len2);
766 	record__write(rec, map, &pad, padding);
767 
768 	return 0;
769 }
770 
771 static int record__auxtrace_mmap_read(struct record *rec,
772 				      struct mmap *map)
773 {
774 	int ret;
775 
776 	ret = auxtrace_mmap__read(map, rec->itr,
777 				  perf_session__env(rec->session),
778 				  &rec->tool,
779 				  record__process_auxtrace);
780 	if (ret < 0)
781 		return ret;
782 
783 	if (ret)
784 		rec->samples++;
785 
786 	return 0;
787 }
788 
789 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
790 					       struct mmap *map)
791 {
792 	int ret;
793 
794 	ret = auxtrace_mmap__read_snapshot(map, rec->itr,
795 					   perf_session__env(rec->session),
796 					   &rec->tool,
797 					   record__process_auxtrace,
798 					   rec->opts.auxtrace_snapshot_size);
799 	if (ret < 0)
800 		return ret;
801 
802 	if (ret)
803 		rec->samples++;
804 
805 	return 0;
806 }
807 
808 static int record__auxtrace_read_snapshot_all(struct record *rec)
809 {
810 	int i;
811 	int rc = 0;
812 
813 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
814 		struct mmap *map = &rec->evlist->mmap[i];
815 
816 		if (!map->auxtrace_mmap.base)
817 			continue;
818 
819 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
820 			rc = -1;
821 			goto out;
822 		}
823 	}
824 out:
825 	return rc;
826 }
827 
828 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
829 {
830 	pr_debug("Recording AUX area tracing snapshot\n");
831 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
832 		trigger_error(&auxtrace_snapshot_trigger);
833 	} else {
834 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
835 			trigger_error(&auxtrace_snapshot_trigger);
836 		else
837 			trigger_ready(&auxtrace_snapshot_trigger);
838 	}
839 }
840 
841 static int record__auxtrace_snapshot_exit(struct record *rec)
842 {
843 	if (trigger_is_error(&auxtrace_snapshot_trigger))
844 		return 0;
845 
846 	if (!auxtrace_record__snapshot_started &&
847 	    auxtrace_record__snapshot_start(rec->itr))
848 		return -1;
849 
850 	record__read_auxtrace_snapshot(rec, true);
851 	if (trigger_is_error(&auxtrace_snapshot_trigger))
852 		return -1;
853 
854 	return 0;
855 }
856 
857 static int record__auxtrace_init(struct record *rec)
858 {
859 	int err;
860 
861 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
862 	    && record__threads_enabled(rec)) {
863 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
864 		return -EINVAL;
865 	}
866 
867 	if (!rec->itr) {
868 		rec->itr = auxtrace_record__init(rec->evlist, &err);
869 		if (err)
870 			return err;
871 	}
872 
873 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
874 					      rec->opts.auxtrace_snapshot_opts);
875 	if (err)
876 		return err;
877 
878 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
879 					    rec->opts.auxtrace_sample_opts);
880 	if (err)
881 		return err;
882 
883 	err = auxtrace_parse_aux_action(rec->evlist);
884 	if (err)
885 		return err;
886 
887 	return auxtrace_parse_filters(rec->evlist);
888 }
889 
890 static int record__config_text_poke(struct evlist *evlist)
891 {
892 	struct evsel *evsel;
893 
894 	/* Nothing to do if text poke is already configured */
895 	evlist__for_each_entry(evlist, evsel) {
896 		if (evsel->core.attr.text_poke)
897 			return 0;
898 	}
899 
900 	evsel = evlist__add_dummy_on_all_cpus(evlist);
901 	if (!evsel)
902 		return -ENOMEM;
903 
904 	evsel->core.attr.text_poke = 1;
905 	evsel->core.attr.ksymbol = 1;
906 	evsel->immediate = true;
907 	evsel__set_sample_bit(evsel, TIME);
908 
909 	return 0;
910 }
911 
912 static int record__config_off_cpu(struct record *rec)
913 {
914 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
915 }
916 
917 static bool record__tracking_system_wide(struct record *rec)
918 {
919 	struct evlist *evlist = rec->evlist;
920 	struct evsel *evsel;
921 
922 	/*
923 	 * If non-dummy evsel exists, system_wide sideband is need to
924 	 * help parse sample information.
925 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
926 	 * and PERF_EVENT_COMM event to help parse task executable name.
927 	 */
928 	evlist__for_each_entry(evlist, evsel) {
929 		if (!evsel__is_dummy_event(evsel))
930 			return true;
931 	}
932 
933 	return false;
934 }
935 
936 static int record__config_tracking_events(struct record *rec)
937 {
938 	struct record_opts *opts = &rec->opts;
939 	struct evlist *evlist = rec->evlist;
940 	bool system_wide = false;
941 	struct evsel *evsel;
942 
943 	/*
944 	 * For initial_delay, system wide or a hybrid system, we need to add
945 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
946 	 * delay of waiting or event synthesis.
947 	 */
948 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
949 	    perf_pmus__num_core_pmus() > 1) {
950 		/*
951 		 * User space tasks can migrate between CPUs, so when tracing
952 		 * selected CPUs, sideband for all CPUs is still needed.
953 		 */
954 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
955 			system_wide = true;
956 
957 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
958 		if (!evsel)
959 			return -ENOMEM;
960 
961 		/*
962 		 * Enable the tracking event when the process is forked for
963 		 * initial_delay, immediately for system wide.
964 		 */
965 		if (opts->target.initial_delay && !evsel->immediate &&
966 		    !target__has_cpu(&opts->target))
967 			evsel->core.attr.enable_on_exec = 1;
968 		else
969 			evsel->immediate = 1;
970 	}
971 
972 	return 0;
973 }
974 
975 static bool record__kcore_readable(struct machine *machine)
976 {
977 	char kcore[PATH_MAX];
978 	int fd;
979 
980 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
981 
982 	fd = open(kcore, O_RDONLY);
983 	if (fd < 0)
984 		return false;
985 
986 	close(fd);
987 
988 	return true;
989 }
990 
991 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
992 {
993 	char from_dir[PATH_MAX];
994 	char kcore_dir[PATH_MAX];
995 	int ret;
996 
997 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
998 
999 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1000 	if (ret)
1001 		return ret;
1002 
1003 	return kcore_copy(from_dir, kcore_dir);
1004 }
1005 
1006 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1007 {
1008 	thread_data->pipes.msg[0] = -1;
1009 	thread_data->pipes.msg[1] = -1;
1010 	thread_data->pipes.ack[0] = -1;
1011 	thread_data->pipes.ack[1] = -1;
1012 }
1013 
1014 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1015 {
1016 	if (pipe(thread_data->pipes.msg))
1017 		return -EINVAL;
1018 
1019 	if (pipe(thread_data->pipes.ack)) {
1020 		close(thread_data->pipes.msg[0]);
1021 		thread_data->pipes.msg[0] = -1;
1022 		close(thread_data->pipes.msg[1]);
1023 		thread_data->pipes.msg[1] = -1;
1024 		return -EINVAL;
1025 	}
1026 
1027 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1028 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1029 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1030 
1031 	return 0;
1032 }
1033 
1034 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1035 {
1036 	if (thread_data->pipes.msg[0] != -1) {
1037 		close(thread_data->pipes.msg[0]);
1038 		thread_data->pipes.msg[0] = -1;
1039 	}
1040 	if (thread_data->pipes.msg[1] != -1) {
1041 		close(thread_data->pipes.msg[1]);
1042 		thread_data->pipes.msg[1] = -1;
1043 	}
1044 	if (thread_data->pipes.ack[0] != -1) {
1045 		close(thread_data->pipes.ack[0]);
1046 		thread_data->pipes.ack[0] = -1;
1047 	}
1048 	if (thread_data->pipes.ack[1] != -1) {
1049 		close(thread_data->pipes.ack[1]);
1050 		thread_data->pipes.ack[1] = -1;
1051 	}
1052 }
1053 
1054 static bool evlist__per_thread(struct evlist *evlist)
1055 {
1056 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1057 }
1058 
/*
 * Build the per-thread arrays of mmap pointers for @thread_data.
 *
 * In per-thread mode the thread gets every evlist mmap; otherwise it gets
 * the mmaps whose CPU is set in thread_data->mask->maps. maps[] and
 * overwrite_maps[] are only allocated when the evlist has the corresponding
 * mmap array.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails (nothing stays
 * allocated in that case).
 */
static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
{
	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
	struct mmap *mmap = evlist->mmap;
	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
	struct perf_cpu_map *cpus = evlist->core.all_cpus;
	bool per_thread = evlist__per_thread(evlist);

	if (per_thread)
		thread_data->nr_mmaps = nr_mmaps;
	else
		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
						      thread_data->mask->maps.nbits);
	if (mmap) {
		thread_data->maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
		if (!thread_data->maps)
			return -ENOMEM;
	}
	if (overwrite_mmap) {
		thread_data->overwrite_maps = calloc(thread_data->nr_mmaps, sizeof(struct mmap *));
		if (!thread_data->overwrite_maps) {
			zfree(&thread_data->maps);
			return -ENOMEM;
		}
	}
	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);

	/* m walks the evlist mmaps, tm is the next free slot of the thread arrays. */
	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
		if (per_thread ||
		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
			if (thread_data->maps) {
				thread_data->maps[tm] = &mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			if (thread_data->overwrite_maps) {
				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
			}
			tm++;
		}
	}

	return 0;
}
1106 
/*
 * Initialize thread_data->pollfd and duplicate into it every evlist pollfd
 * entry whose private pointer refers to one of the thread's mmaps or
 * overwrite mmaps.
 *
 * Returns 0 on success or the negative fdarray__dup_entry_from() error.
 */
static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
{
	int f, tm, pos;
	struct mmap *map, *overwrite_map;

	fdarray__init(&thread_data->pollfd, 64);

	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
		map = thread_data->maps ? thread_data->maps[tm] : NULL;
		overwrite_map = thread_data->overwrite_maps ?
				thread_data->overwrite_maps[tm] : NULL;

		for (f = 0; f < evlist->core.pollfd.nr; f++) {
			void *ptr = evlist->core.pollfd.priv[f].ptr;

			/* The entry belongs to this thread if it points at one of its maps. */
			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
							      &evlist->core.pollfd);
				if (pos < 0)
					return pos;
				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
			}
		}
	}

	return 0;
}
1135 
1136 static void record__free_thread_data(struct record *rec)
1137 {
1138 	int t;
1139 	struct record_thread *thread_data = rec->thread_data;
1140 
1141 	if (thread_data == NULL)
1142 		return;
1143 
1144 	for (t = 0; t < rec->nr_threads; t++) {
1145 		record__thread_data_close_pipes(&thread_data[t]);
1146 		zfree(&thread_data[t].maps);
1147 		zfree(&thread_data[t].overwrite_maps);
1148 		fdarray__exit(&thread_data[t].pollfd);
1149 	}
1150 
1151 	zfree(&rec->thread_data);
1152 }
1153 
1154 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1155 						    int evlist_pollfd_index,
1156 						    int thread_pollfd_index)
1157 {
1158 	size_t x = rec->index_map_cnt;
1159 
1160 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1161 		return -ENOMEM;
1162 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1163 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1164 	rec->index_map_cnt += 1;
1165 	return 0;
1166 }
1167 
1168 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1169 						    struct evlist *evlist,
1170 						    struct record_thread *thread_data)
1171 {
1172 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1173 	struct pollfd *t_entries = thread_data->pollfd.entries;
1174 	int err = 0;
1175 	size_t i;
1176 
1177 	for (i = 0; i < rec->index_map_cnt; i++) {
1178 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1179 		int t_pos = rec->index_map[i].thread_pollfd_index;
1180 
1181 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1182 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1183 			pr_err("Thread and evlist pollfd index mismatch\n");
1184 			err = -EINVAL;
1185 			continue;
1186 		}
1187 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1188 	}
1189 	return err;
1190 }
1191 
1192 static int record__dup_non_perf_events(struct record *rec,
1193 				       struct evlist *evlist,
1194 				       struct record_thread *thread_data)
1195 {
1196 	struct fdarray *fda = &evlist->core.pollfd;
1197 	int i, ret;
1198 
1199 	for (i = 0; i < fda->nr; i++) {
1200 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1201 			continue;
1202 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1203 		if (ret < 0) {
1204 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1205 			return ret;
1206 		}
1207 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1208 			  thread_data, ret, fda->entries[i].fd);
1209 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1210 		if (ret < 0) {
1211 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1212 			return ret;
1213 		}
1214 	}
1215 	return 0;
1216 }
1217 
1218 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1219 {
1220 	int t, ret;
1221 	struct record_thread *thread_data;
1222 
1223 	rec->thread_data = calloc(rec->nr_threads, sizeof(*(rec->thread_data)));
1224 	if (!rec->thread_data) {
1225 		pr_err("Failed to allocate thread data\n");
1226 		return -ENOMEM;
1227 	}
1228 	thread_data = rec->thread_data;
1229 
1230 	for (t = 0; t < rec->nr_threads; t++)
1231 		record__thread_data_init_pipes(&thread_data[t]);
1232 
1233 	for (t = 0; t < rec->nr_threads; t++) {
1234 		thread_data[t].rec = rec;
1235 		thread_data[t].mask = &rec->thread_masks[t];
1236 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1237 		if (ret) {
1238 			pr_err("Failed to initialize thread[%d] maps\n", t);
1239 			goto out_free;
1240 		}
1241 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1242 		if (ret) {
1243 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1244 			goto out_free;
1245 		}
1246 		if (t) {
1247 			thread_data[t].tid = -1;
1248 			ret = record__thread_data_open_pipes(&thread_data[t]);
1249 			if (ret) {
1250 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1251 				goto out_free;
1252 			}
1253 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1254 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1255 			if (ret < 0) {
1256 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1257 				goto out_free;
1258 			}
1259 			thread_data[t].ctlfd_pos = ret;
1260 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1261 				 thread_data, thread_data[t].ctlfd_pos,
1262 				 thread_data[t].pipes.msg[0]);
1263 		} else {
1264 			thread_data[t].tid = gettid();
1265 
1266 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1267 			if (ret < 0)
1268 				goto out_free;
1269 
1270 			thread_data[t].ctlfd_pos = -1; /* Not used */
1271 		}
1272 	}
1273 
1274 	return 0;
1275 
1276 out_free:
1277 	record__free_thread_data(rec);
1278 
1279 	return ret;
1280 }
1281 
1282 static int record__mmap_evlist(struct record *rec,
1283 			       struct evlist *evlist)
1284 {
1285 	int i, ret;
1286 	struct record_opts *opts = &rec->opts;
1287 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1288 				  opts->auxtrace_sample_mode;
1289 
1290 	if (opts->affinity != PERF_AFFINITY_SYS)
1291 		cpu__setup_cpunode_map();
1292 
1293 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1294 				 opts->auxtrace_mmap_pages,
1295 				 auxtrace_overwrite,
1296 				 opts->nr_cblocks, opts->affinity,
1297 				 opts->mmap_flush, opts->comp_level) < 0) {
1298 		if (errno == EPERM) {
1299 			pr_err("Permission error mapping pages.\n"
1300 			       "Consider increasing "
1301 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1302 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1303 			       "(current value: %u,%u)\n",
1304 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1305 			return -errno;
1306 		} else {
1307 			pr_err("failed to mmap: %m\n");
1308 			if (errno)
1309 				return -errno;
1310 			else
1311 				return -EINVAL;
1312 		}
1313 	}
1314 
1315 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1316 		return -1;
1317 
1318 	ret = record__alloc_thread_data(rec, evlist);
1319 	if (ret)
1320 		return ret;
1321 
1322 	if (record__threads_enabled(rec)) {
1323 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1324 		if (ret) {
1325 			errno = -ret;
1326 			pr_err("Failed to create data directory: %m\n");
1327 			return ret;
1328 		}
1329 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1330 			if (evlist->mmap)
1331 				evlist->mmap[i].file = &rec->data.dir.files[i];
1332 			if (evlist->overwrite_mmap)
1333 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1334 		}
1335 	}
1336 
1337 	return 0;
1338 }
1339 
/*
 * Convenience wrapper: run record__mmap_evlist() on the session's own evlist
 * (rec->evlist) and hand back its return value unchanged.
 */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
1344 
/*
 * Open every event in rec->evlist and get the session ready for recording.
 *
 * Each evsel is opened with evsel__open(). When that fails the code first
 * lets evsel__fallback() adjust the event and retries, then tries breaking up
 * a weak group and retries; if neither applies the event is marked skippable
 * (and reported, subject to the ARM "cycles" special case below).
 *
 * After the open pass, skippable events are removed from the list and the
 * remaining ones are renumbered. If no non-dummy event is left the function
 * fails. Filters are then applied, the buffers are mapped via record__mmap()
 * and the evlist is attached to the session.
 *
 * Returns 0 on success, -1 when no event could be opened or a filter could
 * not be set, or the error returned by record__mmap().
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;
	bool skipped = false;
	bool removed_tracking = false;

	evlist__for_each_entry(evlist, pos) {
		if (removed_tracking) {
			/*
			 * Normally the head of the list has tracking enabled
			 * for sideband data like mmaps. If this event is
			 * removed, make sure to add tracking to the next
			 * processed event.
			 */
			if (!pos->tracking) {
				pos->tracking = true;
				evsel__config(pos, opts, &callchain_param);
			}
			removed_tracking = false;
		}
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			bool report_error = true;

			/* A successful fallback modified the event: retry the open. */
			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * A non-leader member of a weak group failed with
			 * EINVAL/EBADF: reset the group and retry from the evsel
			 * returned by evlist__reset_weak_group().
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
#if defined(__aarch64__) || defined(__arm__)
			if (strstr(evsel__name(pos), "cycles")) {
				struct evsel *pos2;
				/*
				 * Unfortunately ARM has many events named
				 * "cycles" on PMUs like the system-level (L3)
				 * cache which don't support sampling. Only
				 * display such failures to open when there is
				 * only 1 cycles event or verbose is enabled.
				 */
				evlist__for_each_entry(evlist, pos2) {
					if (pos2 == pos)
						continue;
					if (strstr(evsel__name(pos2), "cycles")) {
						report_error = false;
						break;
					}
				}
			}
#endif
			if (report_error || verbose > 0) {
				evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
				ui__error("Failure to open event '%s' on PMU '%s' which will be "
					  "removed.\n%s\n",
					  evsel__name(pos), evsel__pmu_name(pos), msg);
			}
			/* Remember to hand the tracking role to the next event. */
			if (pos->tracking)
				removed_tracking = true;
			/* Mark for removal; the list itself is pruned after the loop. */
			pos->skippable = true;
			skipped = true;
		}
	}

	if (skipped) {
		struct evsel *tmp;
		int idx = 0;
		bool evlist_empty = true;

		/* Remove evsels that failed to open and update indices. */
		evlist__for_each_entry_safe(evlist, tmp, pos) {
			if (pos->skippable) {
				evlist__remove(evlist, pos);
				continue;
			}

			/*
			 * Note, dummy events may be command line parsed or
			 * added by the tool. We care about supporting `perf
			 * record -e dummy` which may be used as a permission
			 * check. Dummy events that are added to the command
			 * line and opened along with other events that fail,
			 * will still fail as if the dummy events were tool
			 * added events for the sake of code simplicity.
			 */
			if (!evsel__is_dummy_event(pos))
				evlist_empty = false;
		}
		/* Renumber the surviving events consecutively from 0. */
		evlist__for_each_entry(evlist, pos) {
			pos->core.idx = idx++;
		}
		/* If list is empty then fail. */
		if (evlist_empty) {
			ui__error("Failure to open any events for recording.\n");
			rc = -1;
			goto out;
		}
	}
	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure evlist__apply_filters() leaves the offending evsel in pos. */
	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
		pr_err("failed to set filter \"%s\" on event %s: %m\n",
			pos->filter ?: "BPF", evsel__name(pos));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	/* Only a fully opened and mapped evlist is attached to the session. */
	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1479 
1480 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1481 {
1482 	if (rec->evlist->first_sample_time == 0)
1483 		rec->evlist->first_sample_time = sample_time;
1484 
1485 	if (sample_time)
1486 		rec->evlist->last_sample_time = sample_time;
1487 }
1488 
1489 static int process_sample_event(const struct perf_tool *tool,
1490 				union perf_event *event,
1491 				struct perf_sample *sample,
1492 				struct evsel *evsel,
1493 				struct machine *machine)
1494 {
1495 	struct record *rec = container_of(tool, struct record, tool);
1496 
1497 	set_timestamp_boundary(rec, sample->time);
1498 
1499 	if (rec->buildid_all)
1500 		return 0;
1501 
1502 	rec->samples++;
1503 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1504 }
1505 
1506 static int process_buildids(struct record *rec)
1507 {
1508 	struct perf_session *session = rec->session;
1509 
1510 	if (perf_data__size(&rec->data) == 0)
1511 		return 0;
1512 
1513 	/* A single DSO is needed and not all inline frames. */
1514 	symbol_conf.inline_name = false;
1515 	/*
1516 	 * During this process, it'll load kernel map and replace the
1517 	 * dso->long_name to a real pathname it found.  In this case
1518 	 * we prefer the vmlinux path like
1519 	 *   /lib/modules/3.16.4/build/vmlinux
1520 	 *
1521 	 * rather than build-id path (in debug directory).
1522 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1523 	 */
1524 	symbol_conf.ignore_vmlinux_buildid = true;
1525 	/*
1526 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1527 	 * so no need to process samples. But if timestamp_boundary is enabled,
1528 	 * it still needs to walk on all samples to get the timestamps of
1529 	 * first/last samples.
1530 	 */
1531 	if (rec->buildid_all && !rec->timestamp_boundary)
1532 		rec->tool.sample = process_event_sample_stub;
1533 
1534 	return perf_session__process_events(session);
1535 }
1536 
1537 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1538 {
1539 	int err;
1540 	struct perf_tool *tool = data;
1541 	/*
1542 	 *As for guest kernel when processing subcommand record&report,
1543 	 *we arrange module mmap prior to guest kernel mmap and trigger
1544 	 *a preload dso because default guest module symbols are loaded
1545 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1546 	 *method is used to avoid symbol missing when the first addr is
1547 	 *in module instead of in guest kernel.
1548 	 */
1549 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1550 					     machine);
1551 	if (err < 0)
1552 		pr_err("Couldn't record guest kernel [%d]'s reference"
1553 		       " relocation symbol.\n", machine->pid);
1554 
1555 	/*
1556 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1557 	 * have no _text sometimes.
1558 	 */
1559 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1560 						 machine);
1561 	if (err < 0)
1562 		pr_err("Couldn't record guest kernel [%d]'s reference"
1563 		       " relocation symbol.\n", machine->pid);
1564 }
1565 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event; its size is just that of the
 * perf_event_header. record__mmap_read_evlist() writes it after a pass that
 * produced data when threaded recording is not enabled.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

/*
 * Header-only PERF_RECORD_FINISHED_INIT event, emitted through
 * write_finished_init().
 */
static struct perf_event_header finished_init_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_INIT,
};
1575 
1576 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1577 {
1578 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1579 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1580 			  thread->mask->affinity.nbits)) {
1581 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1582 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1583 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1584 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1585 					(cpu_set_t *)thread->mask->affinity.bits);
1586 		if (verbose == 2) {
1587 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1588 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1589 		}
1590 	}
1591 }
1592 
1593 static size_t process_comp_header(void *record, size_t increment)
1594 {
1595 	struct perf_record_compressed2 *event = record;
1596 	size_t size = sizeof(*event);
1597 
1598 	if (increment) {
1599 		event->header.size += increment;
1600 		return increment;
1601 	}
1602 
1603 	event->header.type = PERF_RECORD_COMPRESSED2;
1604 	event->header.size = size;
1605 
1606 	return size;
1607 }
1608 
1609 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1610 			    void *dst, size_t dst_size, void *src, size_t src_size)
1611 {
1612 	ssize_t compressed;
1613 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1614 	struct zstd_data *zstd_data = &session->zstd_data;
1615 
1616 	if (map && map->file)
1617 		zstd_data = &map->zstd_data;
1618 
1619 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1620 						     max_record_size, process_comp_header);
1621 	if (compressed < 0)
1622 		return compressed;
1623 
1624 	if (map && map->file) {
1625 		thread->bytes_transferred += src_size;
1626 		thread->bytes_compressed  += compressed;
1627 	} else {
1628 		session->bytes_transferred += src_size;
1629 		session->bytes_compressed  += compressed;
1630 	}
1631 
1632 	return compressed;
1633 }
1634 
/*
 * Drain the current thread's ring buffers into the output.
 *
 * @overwrite selects the thread's overwrite maps instead of its regular maps;
 * overwrite maps are only read while evlist->bkw_mmap_state is
 * BKW_MMAP_DATA_PENDING. With @synch set, each map's flush threshold is
 * temporarily forced to 1 for the duration of the push and restored
 * afterwards.
 *
 * Returns 0 on success (including the "nothing to do" cases) and -1 when
 * pushing a map or reading its auxtrace buffer fails; if the trailing
 * FINISHED_ROUND write is attempted, its result is returned instead.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Snapshot, used below to detect whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = perf_data__fd(&rec->data);
	off_t off = 0;

	if (!evlist)
		return 0;

	/* Only the maps assigned to the current thread are processed. */
	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Save the threshold and force the push below to flush. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Store the position reached so far before bailing out. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* AUX data is read here only outside snapshot and sample modes. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	/* The pending overwrite data has been consumed. */
	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1718 
1719 static int record__mmap_read_all(struct record *rec, bool synch)
1720 {
1721 	int err;
1722 
1723 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1724 	if (err)
1725 		return err;
1726 
1727 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1728 }
1729 
1730 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1731 					   void *arg __maybe_unused)
1732 {
1733 	struct perf_mmap *map = fda->priv[fd].ptr;
1734 
1735 	if (map)
1736 		perf_mmap__put(map);
1737 }
1738 
/*
 * Body of a recording worker thread.
 *
 * @arg is the thread's struct record_thread; it is stored in the file-scope
 * `thread` pointer that the mmap reading helpers consult. The thread sends a
 * message on its ack pipe when it starts, then loops reading its mmaps and
 * polling its descriptors until a read fails, every pollable descriptor has
 * been filtered out, or a hang-up on the control descriptor has been seen and
 * one more read pass has completed. A final synchronous read is done before
 * the ack pipe is written once more.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Notify the other end of the ack pipe that this thread is running. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		unsigned long long hits = thread->samples;

		/*
		 * The read runs before `terminate` is tested, so one more pass
		 * over the maps happens after the hang-up was noticed.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Sample counter unchanged by the read: block until something happens. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			/* NOTE(review): err is not consulted after this point in the loop. */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Stop once fdarray__filter() reports 0 after dropping ERR/HUP entries. */
			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the control descriptor is the request to terminate. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final pass with synch set so the remaining data is pushed out. */
	record__mmap_read_all(thread->rec, true);

	/* msg still holds THREAD_MSG__READY; the same value is sent on exit. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);

	return NULL;
}
1798 
1799 static void record__init_features(struct record *rec)
1800 {
1801 	struct perf_session *session = rec->session;
1802 	int feat;
1803 
1804 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1805 		perf_header__set_feat(&session->header, feat);
1806 
1807 	if (rec->no_buildid)
1808 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1809 
1810 	if (!have_tracepoints(&rec->evlist->core.entries))
1811 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1812 
1813 	if (!rec->opts.branch_stack)
1814 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1815 
1816 	if (!rec->opts.full_auxtrace)
1817 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1818 
1819 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1820 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1821 
1822 	if (!rec->opts.use_clockid)
1823 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1824 
1825 	if (!record__threads_enabled(rec))
1826 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1827 
1828 	if (!record__comp_enabled(rec))
1829 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1830 
1831 	perf_header__clear_feat(&session->header, HEADER_STAT);
1832 }
1833 
1834 static void
1835 record__finish_output(struct record *rec)
1836 {
1837 	int i;
1838 	struct perf_data *data = &rec->data;
1839 	int fd = perf_data__fd(data);
1840 
1841 	if (data->is_pipe) {
1842 		/* Just to display approx. size */
1843 		data->file.size = rec->bytes_written;
1844 		return;
1845 	}
1846 
1847 	rec->session->header.data_size += rec->bytes_written;
1848 	data->file.size = perf_data__seek(data, 0, SEEK_CUR);
1849 	if (record__threads_enabled(rec)) {
1850 		for (i = 0; i < data->dir.nr; i++) {
1851 			data->dir.files[i].size =
1852 				perf_data_file__seek(&data->dir.files[i], 0, SEEK_CUR);
1853 		}
1854 	}
1855 
1856 	/* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1857 	if (!rec->no_buildid || !rec->no_buildid_cache) {
1858 		process_buildids(rec);
1859 
1860 		if (rec->buildid_all)
1861 			perf_session__dsos_hit_all(rec->session);
1862 	}
1863 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1864 	perf_session__cache_build_ids(rec->session);
1865 }
1866 
1867 static int record__synthesize_workload(struct record *rec, bool tail)
1868 {
1869 	int err;
1870 	struct perf_thread_map *thread_map;
1871 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1872 
1873 	if (rec->opts.tail_synthesize != tail)
1874 		return 0;
1875 
1876 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1877 	if (thread_map == NULL)
1878 		return -1;
1879 
1880 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1881 						 process_synthesized_event,
1882 						 &rec->session->machines.host,
1883 						 needs_mmap,
1884 						 rec->opts.record_data_mmap);
1885 	perf_thread_map__put(thread_map);
1886 	return err;
1887 }
1888 
1889 static int write_finished_init(struct record *rec, bool tail)
1890 {
1891 	if (rec->opts.tail_synthesize != tail)
1892 		return 0;
1893 
1894 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1895 }
1896 
/* Defined further down; needed by record__switch_output() below. */
static int record__synthesize(struct record *rec, bool tail);

/*
 * Finish the current output file and switch recording to a new one.
 *
 * The tail-mode synthesized events are emitted and the current output is
 * finalized first. The file is then switched via perf_data__switch() using a
 * timestamp suffix. When a limit on the number of output files is configured
 * (switch_output.num_files), the slot being reused has its old file removed.
 * Unless @at_exit is set, the head-mode synthesized events are written into
 * the new file.
 *
 * Returns the value of perf_data__switch(), or -EINVAL if the current
 * timestamp could not be obtained.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	/* These helpers only act when tail_synthesize is set (tail == true). */
	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	/* Byte accounting restarts only when recording continues in a new file. */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet) {
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);
	}

	if (rec->switch_output.num_files) {
		/* Advance to the next slot, wrapping around at num_files. */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		/* Reusing a slot: delete the file it referred to before. */
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		/* The slot takes over ownership of new_filename. */
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1972 
1973 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1974 					struct perf_record_lost_samples *lost,
1975 					int cpu_idx, int thread_idx, u64 lost_count,
1976 					u16 misc_flag)
1977 {
1978 	struct perf_sample_id *sid;
1979 	struct perf_sample sample;
1980 	int id_hdr_size;
1981 
1982 	perf_sample__init(&sample, /*all=*/true);
1983 	lost->lost = lost_count;
1984 	if (evsel->core.ids) {
1985 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1986 		sample.id = sid->id;
1987 	}
1988 
1989 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1990 						       evsel->core.attr.sample_type, &sample);
1991 	lost->header.size = sizeof(*lost) + id_hdr_size;
1992 	lost->header.misc = misc_flag;
1993 	record__write(rec, NULL, lost, lost->header.size);
1994 	perf_sample__exit(&sample);
1995 }
1996 
1997 static void record__read_lost_samples(struct record *rec)
1998 {
1999 	struct perf_session *session = rec->session;
2000 	struct perf_record_lost_samples_and_ids lost;
2001 	struct evsel *evsel;
2002 
2003 	/* there was an error during record__open */
2004 	if (session->evlist == NULL)
2005 		return;
2006 
2007 	evlist__for_each_entry(session->evlist, evsel) {
2008 		struct xyarray *xy = evsel->core.sample_id;
2009 		u64 lost_count;
2010 
2011 		if (xy == NULL || evsel->core.fd == NULL)
2012 			continue;
2013 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2014 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2015 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2016 			continue;
2017 		}
2018 
2019 		for (int x = 0; x < xyarray__max_x(xy); x++) {
2020 			for (int y = 0; y < xyarray__max_y(xy); y++) {
2021 				struct perf_counts_values count;
2022 
2023 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2024 					pr_debug("read LOST count failed\n");
2025 					return;
2026 				}
2027 
2028 				if (count.lost) {
2029 					memset(&lost, 0, sizeof(lost));
2030 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2031 					__record__save_lost_samples(rec, evsel, &lost.lost,
2032 								    x, y, count.lost, 0);
2033 				}
2034 			}
2035 		}
2036 
2037 		lost_count = perf_bpf_filter__lost_count(evsel);
2038 		if (lost_count) {
2039 			memset(&lost, 0, sizeof(lost));
2040 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2041 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2042 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2043 		}
2044 	}
2045 }
2046 
/* Value delivered with the signal below; written from signal context. */
static volatile sig_atomic_t workload_exec_errno;

/*
 * evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * The handler stores the integer carried in the signal's si_value and flags
 * both `done` and `child_finished`.
 *
 * NOTE(review): the handler and variable names suggest the signal reports a
 * failed exec of the workload rather than a failed fork - confirm against
 * evlist__prepare_workload().
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}

/* Signal handlers defined later in the file. */
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);
2065 
2066 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2067 {
2068 	if (evlist) {
2069 		if (evlist->mmap && evlist->mmap[0].core.base)
2070 			return evlist->mmap[0].core.base;
2071 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2072 			return evlist->overwrite_mmap[0].core.base;
2073 	}
2074 	return NULL;
2075 }
2076 
2077 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2078 {
2079 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2080 	if (pc)
2081 		return pc;
2082 	return NULL;
2083 }
2084 
2085 static int record__synthesize(struct record *rec, bool tail)
2086 {
2087 	struct perf_session *session = rec->session;
2088 	struct machine *machine = &session->machines.host;
2089 	struct perf_data *data = &rec->data;
2090 	struct record_opts *opts = &rec->opts;
2091 	struct perf_tool *tool = &rec->tool;
2092 	int err = 0;
2093 	event_op f = process_synthesized_event;
2094 
2095 	if (rec->opts.tail_synthesize != tail)
2096 		return 0;
2097 
2098 	if (data->is_pipe) {
2099 		err = perf_event__synthesize_for_pipe(tool, session, data,
2100 						      process_synthesized_event);
2101 		if (err < 0)
2102 			goto out;
2103 
2104 		rec->bytes_written += err;
2105 	}
2106 
2107 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2108 					  process_synthesized_event, machine);
2109 	if (err)
2110 		goto out;
2111 
2112 	/* Synthesize id_index before auxtrace_info */
2113 	err = perf_event__synthesize_id_index(tool,
2114 					      process_synthesized_event,
2115 					      session->evlist, machine);
2116 	if (err)
2117 		goto out;
2118 
2119 	if (rec->opts.full_auxtrace) {
2120 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2121 					session, process_synthesized_event);
2122 		if (err)
2123 			goto out;
2124 	}
2125 
2126 	if (!evlist__exclude_kernel(rec->evlist)) {
2127 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2128 							 machine);
2129 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2130 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2131 				   "Check /proc/kallsyms permission or run as root.\n");
2132 
2133 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2134 						     machine);
2135 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2136 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2137 				   "Check /proc/modules permission or run as root.\n");
2138 	}
2139 
2140 	if (perf_guest) {
2141 		machines__process_guests(&session->machines,
2142 					 perf_event__synthesize_guest_os, tool);
2143 	}
2144 
2145 	err = perf_event__synthesize_extra_attr(&rec->tool,
2146 						rec->evlist,
2147 						process_synthesized_event,
2148 						data->is_pipe);
2149 	if (err)
2150 		goto out;
2151 
2152 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2153 						 process_synthesized_event,
2154 						NULL);
2155 	if (err < 0) {
2156 		pr_err("Couldn't synthesize thread map.\n");
2157 		return err;
2158 	}
2159 
2160 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2161 					     process_synthesized_event, NULL);
2162 	if (err < 0) {
2163 		pr_err("Couldn't synthesize cpu map.\n");
2164 		return err;
2165 	}
2166 
2167 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2168 						machine, opts);
2169 	if (err < 0) {
2170 		pr_warning("Couldn't synthesize bpf events.\n");
2171 		err = 0;
2172 	}
2173 
2174 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2175 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2176 						     machine);
2177 		if (err < 0) {
2178 			pr_warning("Couldn't synthesize cgroup events.\n");
2179 			err = 0;
2180 		}
2181 	}
2182 
2183 	if (rec->opts.nr_threads_synthesize > 1) {
2184 		mutex_init(&synth_lock);
2185 		perf_set_multithreaded();
2186 		f = process_locked_synthesized_event;
2187 	}
2188 
2189 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2190 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2191 
2192 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2193 						    rec->evlist->core.threads,
2194 						    f, needs_mmap, opts->record_data_mmap,
2195 						    rec->opts.nr_threads_synthesize);
2196 	}
2197 
2198 	if (rec->opts.nr_threads_synthesize > 1) {
2199 		perf_set_singlethreaded();
2200 		mutex_destroy(&synth_lock);
2201 	}
2202 
2203 out:
2204 	return err;
2205 }
2206 
/*
 * Synthesize the final BPF metadata events for the record session.
 * Compiles to a no-op when HAVE_LIBBPF_SUPPORT is not defined, which is
 * why @rec is marked __maybe_unused.
 */
static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
{
#ifdef HAVE_LIBBPF_SUPPORT
	perf_event__synthesize_final_bpf_metadata(rec->session,
						  process_synthesized_event);
#endif
}
2214 
2215 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2216 {
2217 	struct record *rec = data;
2218 	pthread_kill(rec->thread_id, SIGUSR2);
2219 	return 0;
2220 }
2221 
2222 static int record__setup_sb_evlist(struct record *rec)
2223 {
2224 	struct record_opts *opts = &rec->opts;
2225 
2226 	if (rec->sb_evlist != NULL) {
2227 		/*
2228 		 * We get here if --switch-output-event populated the
2229 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2230 		 * to the main thread.
2231 		 */
2232 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2233 		rec->thread_id = pthread_self();
2234 	}
2235 #ifdef HAVE_LIBBPF_SUPPORT
2236 	if (!opts->no_bpf_event) {
2237 		if (rec->sb_evlist == NULL) {
2238 			rec->sb_evlist = evlist__new();
2239 
2240 			if (rec->sb_evlist == NULL) {
2241 				pr_err("Couldn't create side band evlist.\n.");
2242 				return -1;
2243 			}
2244 		}
2245 
2246 		if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2247 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2248 			return -1;
2249 		}
2250 	}
2251 #endif
2252 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2253 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2254 		opts->no_bpf_event = true;
2255 	}
2256 
2257 	return 0;
2258 }
2259 
2260 static int record__init_clock(struct record *rec)
2261 {
2262 	struct perf_session *session = rec->session;
2263 	struct timespec ref_clockid;
2264 	struct timeval ref_tod;
2265 	struct perf_env *env = perf_session__env(session);
2266 	u64 ref;
2267 
2268 	if (!rec->opts.use_clockid)
2269 		return 0;
2270 
2271 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2272 		env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2273 
2274 	env->clock.clockid = rec->opts.clockid;
2275 
2276 	if (gettimeofday(&ref_tod, NULL) != 0) {
2277 		pr_err("gettimeofday failed, cannot set reference time.\n");
2278 		return -1;
2279 	}
2280 
2281 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2282 		pr_err("clock_gettime failed, cannot set reference time.\n");
2283 		return -1;
2284 	}
2285 
2286 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2287 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2288 
2289 	env->clock.tod_ns = ref;
2290 
2291 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2292 	      (u64) ref_clockid.tv_nsec;
2293 
2294 	env->clock.clockid_ns = ref;
2295 	return 0;
2296 }
2297 
2298 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2299 {
2300 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2301 		trigger_hit(&auxtrace_snapshot_trigger);
2302 		auxtrace_record__snapshot_started = 1;
2303 		if (auxtrace_record__snapshot_start(rec->itr))
2304 			trigger_error(&auxtrace_snapshot_trigger);
2305 	}
2306 }
2307 
2308 static int record__terminate_thread(struct record_thread *thread_data)
2309 {
2310 	int err;
2311 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2312 	pid_t tid = thread_data->tid;
2313 
2314 	close(thread_data->pipes.msg[1]);
2315 	thread_data->pipes.msg[1] = -1;
2316 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2317 	if (err > 0)
2318 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2319 	else
2320 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2321 			   thread->tid, tid);
2322 
2323 	return 0;
2324 }
2325 
2326 static int record__start_threads(struct record *rec)
2327 {
2328 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2329 	struct record_thread *thread_data = rec->thread_data;
2330 	sigset_t full, mask;
2331 	pthread_t handle;
2332 	pthread_attr_t attrs;
2333 
2334 	thread = &thread_data[0];
2335 
2336 	if (!record__threads_enabled(rec))
2337 		return 0;
2338 
2339 	sigfillset(&full);
2340 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2341 		pr_err("Failed to block signals on threads start: %m\n");
2342 		return -1;
2343 	}
2344 
2345 	pthread_attr_init(&attrs);
2346 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2347 
2348 	for (t = 1; t < nr_threads; t++) {
2349 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2350 
2351 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2352 		pthread_attr_setaffinity_np(&attrs,
2353 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2354 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2355 #endif
2356 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2357 			for (tt = 1; tt < t; tt++)
2358 				record__terminate_thread(&thread_data[t]);
2359 			pr_err("Failed to start threads: %m\n");
2360 			ret = -1;
2361 			goto out_err;
2362 		}
2363 
2364 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2365 		if (err > 0)
2366 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2367 				  thread_msg_tags[msg]);
2368 		else
2369 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2370 				   thread->tid, rec->thread_data[t].tid);
2371 	}
2372 
2373 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2374 			(cpu_set_t *)thread->mask->affinity.bits);
2375 
2376 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2377 
2378 out_err:
2379 	pthread_attr_destroy(&attrs);
2380 
2381 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2382 		pr_err("Failed to unblock signals on threads start: %m\n");
2383 		ret = -1;
2384 	}
2385 
2386 	return ret;
2387 }
2388 
2389 static int record__stop_threads(struct record *rec)
2390 {
2391 	int t;
2392 	struct record_thread *thread_data = rec->thread_data;
2393 
2394 	for (t = 1; t < rec->nr_threads; t++)
2395 		record__terminate_thread(&thread_data[t]);
2396 
2397 	for (t = 0; t < rec->nr_threads; t++) {
2398 		rec->samples += thread_data[t].samples;
2399 		if (!record__threads_enabled(rec))
2400 			continue;
2401 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2402 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2403 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2404 			 thread_data[t].samples, thread_data[t].waking);
2405 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2406 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2407 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2408 		else
2409 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2410 	}
2411 
2412 	return 0;
2413 }
2414 
2415 static unsigned long record__waking(struct record *rec)
2416 {
2417 	int t;
2418 	unsigned long waking = 0;
2419 	struct record_thread *thread_data = rec->thread_data;
2420 
2421 	for (t = 0; t < rec->nr_threads; t++)
2422 		waking += thread_data[t].waking;
2423 
2424 	return waking;
2425 }
2426 
2427 static int __cmd_record(struct record *rec, int argc, const char **argv)
2428 {
2429 	int err;
2430 	int status = 0;
2431 	const bool forks = argc > 0;
2432 	struct perf_tool *tool = &rec->tool;
2433 	struct record_opts *opts = &rec->opts;
2434 	struct perf_data *data = &rec->data;
2435 	struct perf_session *session;
2436 	bool disabled = false, draining = false;
2437 	int fd;
2438 	float ratio = 0;
2439 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2440 	struct perf_env *env;
2441 
2442 	atexit(record__sig_exit);
2443 	signal(SIGCHLD, sig_handler);
2444 	signal(SIGINT, sig_handler);
2445 	signal(SIGTERM, sig_handler);
2446 	signal(SIGSEGV, sigsegv_handler);
2447 
2448 	if (rec->opts.record_cgroup) {
2449 #ifndef HAVE_FILE_HANDLE
2450 		pr_err("cgroup tracking is not supported\n");
2451 		return -1;
2452 #endif
2453 	}
2454 
2455 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2456 		signal(SIGUSR2, snapshot_sig_handler);
2457 		if (rec->opts.auxtrace_snapshot_mode)
2458 			trigger_on(&auxtrace_snapshot_trigger);
2459 		if (rec->switch_output.enabled)
2460 			trigger_on(&switch_output_trigger);
2461 	} else {
2462 		signal(SIGUSR2, SIG_IGN);
2463 	}
2464 
2465 	perf_tool__init(tool, /*ordered_events=*/true);
2466 	tool->sample		= process_sample_event;
2467 	tool->fork		= perf_event__process_fork;
2468 	tool->exit		= perf_event__process_exit;
2469 	tool->comm		= perf_event__process_comm;
2470 	tool->namespaces	= perf_event__process_namespaces;
2471 	tool->mmap		= build_id__process_mmap;
2472 	tool->mmap2		= build_id__process_mmap2;
2473 	tool->itrace_start	= process_timestamp_boundary;
2474 	tool->aux		= process_timestamp_boundary;
2475 	tool->namespace_events	= rec->opts.record_namespaces;
2476 	tool->cgroup_events	= rec->opts.record_cgroup;
2477 	session = perf_session__new(data, tool);
2478 	if (IS_ERR(session)) {
2479 		pr_err("Perf session creation failed.\n");
2480 		return PTR_ERR(session);
2481 	}
2482 	env = perf_session__env(session);
2483 	if (record__threads_enabled(rec)) {
2484 		if (perf_data__is_pipe(&rec->data)) {
2485 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2486 			return -1;
2487 		}
2488 		if (rec->opts.full_auxtrace) {
2489 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2490 			return -1;
2491 		}
2492 	}
2493 
2494 	fd = perf_data__fd(data);
2495 	rec->session = session;
2496 
2497 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2498 		pr_err("Compression initialization failed.\n");
2499 		return -1;
2500 	}
2501 #ifdef HAVE_EVENTFD_SUPPORT
2502 	done_fd = eventfd(0, EFD_NONBLOCK);
2503 	if (done_fd < 0) {
2504 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2505 		status = -1;
2506 		goto out_delete_session;
2507 	}
2508 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2509 	if (err < 0) {
2510 		pr_err("Failed to add wakeup eventfd to poll list\n");
2511 		status = err;
2512 		goto out_delete_session;
2513 	}
2514 #endif // HAVE_EVENTFD_SUPPORT
2515 
2516 	env->comp_type  = PERF_COMP_ZSTD;
2517 	env->comp_level = rec->opts.comp_level;
2518 
2519 	if (rec->opts.kcore &&
2520 	    !record__kcore_readable(&session->machines.host)) {
2521 		pr_err("ERROR: kcore is not readable.\n");
2522 		return -1;
2523 	}
2524 
2525 	if (record__init_clock(rec))
2526 		return -1;
2527 
2528 	record__init_features(rec);
2529 
2530 	if (forks) {
2531 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2532 					       workload_exec_failed_signal);
2533 		if (err < 0) {
2534 			pr_err("Couldn't run the workload!\n");
2535 			status = err;
2536 			goto out_delete_session;
2537 		}
2538 	}
2539 
2540 	/*
2541 	 * If we have just single event and are sending data
2542 	 * through pipe, we need to force the ids allocation,
2543 	 * because we synthesize event name through the pipe
2544 	 * and need the id for that.
2545 	 */
2546 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2547 		rec->opts.sample_id = true;
2548 
2549 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2550 		rec->timestamp_filename = false;
2551 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2552 	}
2553 
2554 	/*
2555 	 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2556 	 * and hybrid_merge is false.
2557 	 */
2558 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2559 
2560 	evlist__config(rec->evlist, opts, &callchain_param);
2561 
2562 	/* Debug message used by test scripts */
2563 	pr_debug3("perf record opening and mmapping events\n");
2564 	if (record__open(rec) != 0) {
2565 		err = -1;
2566 		goto out_free_threads;
2567 	}
2568 	/* Debug message used by test scripts */
2569 	pr_debug3("perf record done opening and mmapping events\n");
2570 	env->comp_mmap_len = session->evlist->core.mmap_len;
2571 
2572 	if (rec->opts.kcore) {
2573 		err = record__kcore_copy(&session->machines.host, data);
2574 		if (err) {
2575 			pr_err("ERROR: Failed to copy kcore\n");
2576 			goto out_free_threads;
2577 		}
2578 	}
2579 
2580 	/*
2581 	 * Normally perf_session__new would do this, but it doesn't have the
2582 	 * evlist.
2583 	 */
2584 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2585 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2586 		rec->tool.ordered_events = false;
2587 	}
2588 
2589 	if (evlist__nr_groups(rec->evlist) == 0)
2590 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2591 
2592 	if (data->is_pipe) {
2593 		err = perf_header__write_pipe(fd);
2594 		if (err < 0)
2595 			goto out_free_threads;
2596 	} else {
2597 		err = perf_session__write_header(session, rec->evlist, fd, false);
2598 		if (err < 0)
2599 			goto out_free_threads;
2600 	}
2601 
2602 	err = -1;
2603 	if (!rec->no_buildid
2604 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2605 		pr_err("Couldn't generate buildids. "
2606 		       "Use --no-buildid to profile anyway.\n");
2607 		goto out_free_threads;
2608 	}
2609 
2610 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2611 		opts->no_bpf_event = true;
2612 
2613 	err = record__setup_sb_evlist(rec);
2614 	if (err)
2615 		goto out_free_threads;
2616 
2617 	err = record__synthesize(rec, false);
2618 	if (err < 0)
2619 		goto out_free_threads;
2620 
2621 	if (rec->realtime_prio) {
2622 		struct sched_param param;
2623 
2624 		param.sched_priority = rec->realtime_prio;
2625 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2626 			pr_err("Could not set realtime priority.\n");
2627 			err = -1;
2628 			goto out_free_threads;
2629 		}
2630 	}
2631 
2632 	if (record__start_threads(rec))
2633 		goto out_free_threads;
2634 
2635 	/*
2636 	 * When perf is starting the traced process, all the events
2637 	 * (apart from group members) have enable_on_exec=1 set,
2638 	 * so don't spoil it by prematurely enabling them.
2639 	 */
2640 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2641 		evlist__enable(rec->evlist);
2642 
2643 	/*
2644 	 * offcpu-time does not call execve, so enable_on_exe wouldn't work
2645 	 * when recording a workload, do it manually
2646 	 */
2647 	if (rec->off_cpu)
2648 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2649 
2650 	/*
2651 	 * Let the child rip
2652 	 */
2653 	if (forks) {
2654 		struct machine *machine = &session->machines.host;
2655 		union perf_event *event;
2656 		pid_t tgid;
2657 
2658 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2659 		if (event == NULL) {
2660 			err = -ENOMEM;
2661 			goto out_child;
2662 		}
2663 
2664 		/*
2665 		 * Some H/W events are generated before COMM event
2666 		 * which is emitted during exec(), so perf script
2667 		 * cannot see a correct process name for those events.
2668 		 * Synthesize COMM event to prevent it.
2669 		 */
2670 		tgid = perf_event__synthesize_comm(tool, event,
2671 						   rec->evlist->workload.pid,
2672 						   process_synthesized_event,
2673 						   machine);
2674 		free(event);
2675 
2676 		if (tgid == -1)
2677 			goto out_child;
2678 
2679 		event = malloc(sizeof(event->namespaces) +
2680 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2681 			       machine->id_hdr_size);
2682 		if (event == NULL) {
2683 			err = -ENOMEM;
2684 			goto out_child;
2685 		}
2686 
2687 		/*
2688 		 * Synthesize NAMESPACES event for the command specified.
2689 		 */
2690 		perf_event__synthesize_namespaces(tool, event,
2691 						  rec->evlist->workload.pid,
2692 						  tgid, process_synthesized_event,
2693 						  machine);
2694 		free(event);
2695 
2696 		evlist__start_workload(rec->evlist);
2697 	}
2698 
2699 	if (opts->target.initial_delay) {
2700 		pr_info(EVLIST_DISABLED_MSG);
2701 		if (opts->target.initial_delay > 0) {
2702 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2703 			evlist__enable(rec->evlist);
2704 			pr_info(EVLIST_ENABLED_MSG);
2705 		}
2706 	}
2707 
2708 	err = event_enable_timer__start(rec->evlist->eet);
2709 	if (err)
2710 		goto out_child;
2711 
2712 	/* Debug message used by test scripts */
2713 	pr_debug3("perf record has started\n");
2714 	fflush(stderr);
2715 
2716 	trigger_ready(&auxtrace_snapshot_trigger);
2717 	trigger_ready(&switch_output_trigger);
2718 	perf_hooks__invoke_record_start();
2719 
2720 	/*
2721 	 * Must write FINISHED_INIT so it will be seen after all other
2722 	 * synthesized user events, but before any regular events.
2723 	 */
2724 	err = write_finished_init(rec, false);
2725 	if (err < 0)
2726 		goto out_child;
2727 
2728 	for (;;) {
2729 		unsigned long long hits = thread->samples;
2730 
2731 		/*
2732 		 * rec->evlist->bkw_mmap_state is possible to be
2733 		 * BKW_MMAP_EMPTY here: when done == true and
2734 		 * hits != rec->samples in previous round.
2735 		 *
2736 		 * evlist__toggle_bkw_mmap ensure we never
2737 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2738 		 */
2739 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2740 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2741 
2742 		if (record__mmap_read_all(rec, false) < 0) {
2743 			trigger_error(&auxtrace_snapshot_trigger);
2744 			trigger_error(&switch_output_trigger);
2745 			err = -1;
2746 			goto out_child;
2747 		}
2748 
2749 		if (auxtrace_record__snapshot_started) {
2750 			auxtrace_record__snapshot_started = 0;
2751 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2752 				record__read_auxtrace_snapshot(rec, false);
2753 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2754 				pr_err("AUX area tracing snapshot failed\n");
2755 				err = -1;
2756 				goto out_child;
2757 			}
2758 		}
2759 
2760 		if (trigger_is_hit(&switch_output_trigger)) {
2761 			/*
2762 			 * If switch_output_trigger is hit, the data in
2763 			 * overwritable ring buffer should have been collected,
2764 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2765 			 *
2766 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2767 			 * record__mmap_read_all() didn't collect data from
2768 			 * overwritable ring buffer. Read again.
2769 			 */
2770 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2771 				continue;
2772 			trigger_ready(&switch_output_trigger);
2773 
2774 			/*
2775 			 * Reenable events in overwrite ring buffer after
2776 			 * record__mmap_read_all(): we should have collected
2777 			 * data from it.
2778 			 */
2779 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2780 
2781 			if (!quiet)
2782 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2783 					record__waking(rec));
2784 			thread->waking = 0;
2785 			fd = record__switch_output(rec, false);
2786 			if (fd < 0) {
2787 				pr_err("Failed to switch to new file\n");
2788 				trigger_error(&switch_output_trigger);
2789 				err = fd;
2790 				goto out_child;
2791 			}
2792 
2793 			/* re-arm the alarm */
2794 			if (rec->switch_output.time)
2795 				alarm(rec->switch_output.time);
2796 		}
2797 
2798 		if (hits == thread->samples) {
2799 			if (done || draining)
2800 				break;
2801 			err = fdarray__poll(&thread->pollfd, -1);
2802 			/*
2803 			 * Propagate error, only if there's any. Ignore positive
2804 			 * number of returned events and interrupt error.
2805 			 */
2806 			if (err > 0 || (err < 0 && errno == EINTR))
2807 				err = 0;
2808 			thread->waking++;
2809 
2810 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2811 					    record__thread_munmap_filtered, NULL) == 0)
2812 				draining = true;
2813 
2814 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2815 			if (err)
2816 				goto out_child;
2817 		}
2818 
2819 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2820 			switch (cmd) {
2821 			case EVLIST_CTL_CMD_SNAPSHOT:
2822 				hit_auxtrace_snapshot_trigger(rec);
2823 				evlist__ctlfd_ack(rec->evlist);
2824 				break;
2825 			case EVLIST_CTL_CMD_STOP:
2826 				done = 1;
2827 				break;
2828 			case EVLIST_CTL_CMD_ACK:
2829 			case EVLIST_CTL_CMD_UNSUPPORTED:
2830 			case EVLIST_CTL_CMD_ENABLE:
2831 			case EVLIST_CTL_CMD_DISABLE:
2832 			case EVLIST_CTL_CMD_EVLIST:
2833 			case EVLIST_CTL_CMD_PING:
2834 			default:
2835 				break;
2836 			}
2837 		}
2838 
2839 		err = event_enable_timer__process(rec->evlist->eet);
2840 		if (err < 0)
2841 			goto out_child;
2842 		if (err) {
2843 			err = 0;
2844 			done = 1;
2845 		}
2846 
2847 		/*
2848 		 * When perf is starting the traced process, at the end events
2849 		 * die with the process and we wait for that. Thus no need to
2850 		 * disable events in this case.
2851 		 */
2852 		if (done && !disabled && !target__none(&opts->target)) {
2853 			trigger_off(&auxtrace_snapshot_trigger);
2854 			evlist__disable(rec->evlist);
2855 			disabled = true;
2856 		}
2857 	}
2858 
2859 	trigger_off(&auxtrace_snapshot_trigger);
2860 	trigger_off(&switch_output_trigger);
2861 
2862 	record__synthesize_final_bpf_metadata(rec);
2863 
2864 	if (opts->auxtrace_snapshot_on_exit)
2865 		record__auxtrace_snapshot_exit(rec);
2866 
2867 	if (forks && workload_exec_errno) {
2868 		char msg[STRERR_BUFSIZE];
2869 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2870 		struct strbuf sb = STRBUF_INIT;
2871 
2872 		evlist__format_evsels(rec->evlist, &sb, 2048);
2873 
2874 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2875 			sb.buf, argv[0], emsg);
2876 		strbuf_release(&sb);
2877 		err = -1;
2878 		goto out_child;
2879 	}
2880 
2881 	if (!quiet)
2882 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2883 			record__waking(rec));
2884 
2885 	write_finished_init(rec, true);
2886 
2887 	if (target__none(&rec->opts.target))
2888 		record__synthesize_workload(rec, true);
2889 
2890 out_child:
2891 	record__stop_threads(rec);
2892 	record__mmap_read_all(rec, true);
2893 out_free_threads:
2894 	record__free_thread_data(rec);
2895 	evlist__finalize_ctlfd(rec->evlist);
2896 	record__aio_mmap_read_sync(rec);
2897 
2898 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2899 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2900 		env->comp_ratio = ratio + 0.5;
2901 	}
2902 
2903 	if (forks) {
2904 		int exit_status;
2905 
2906 		if (!child_finished)
2907 			kill(rec->evlist->workload.pid, SIGTERM);
2908 
2909 		wait(&exit_status);
2910 
2911 		if (err < 0)
2912 			status = err;
2913 		else if (WIFEXITED(exit_status))
2914 			status = WEXITSTATUS(exit_status);
2915 		else if (WIFSIGNALED(exit_status))
2916 			signr = WTERMSIG(exit_status);
2917 	} else
2918 		status = err;
2919 
2920 	if (rec->off_cpu)
2921 		rec->bytes_written += off_cpu_write(rec->session);
2922 
2923 	record__read_lost_samples(rec);
2924 	/* this will be recalculated during process_buildids() */
2925 	rec->samples = 0;
2926 
2927 	if (!err) {
2928 		record__synthesize(rec, true);
2929 		if (!rec->timestamp_filename) {
2930 			record__finish_output(rec);
2931 		} else {
2932 			fd = record__switch_output(rec, true);
2933 			if (fd < 0) {
2934 				status = fd;
2935 				goto out_delete_session;
2936 			}
2937 		}
2938 	}
2939 
2940 	perf_hooks__invoke_record_end();
2941 
2942 	if (!err && !quiet) {
2943 		char samples[128];
2944 		const char *postfix = rec->timestamp_filename ?
2945 					".<timestamp>" : "";
2946 
2947 		if (rec->samples && !rec->opts.full_auxtrace)
2948 			scnprintf(samples, sizeof(samples),
2949 				  " (%" PRIu64 " samples)", rec->samples);
2950 		else
2951 			samples[0] = '\0';
2952 
2953 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2954 			perf_data__size(data) / 1024.0 / 1024.0,
2955 			data->path, postfix, samples);
2956 		if (ratio) {
2957 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2958 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2959 					ratio);
2960 		}
2961 		fprintf(stderr, " ]\n");
2962 	}
2963 
2964 out_delete_session:
2965 #ifdef HAVE_EVENTFD_SUPPORT
2966 	if (done_fd >= 0) {
2967 		fd = done_fd;
2968 		done_fd = -1;
2969 
2970 		close(fd);
2971 	}
2972 #endif
2973 	zstd_fini(&session->zstd_data);
2974 	if (!opts->no_bpf_event)
2975 		evlist__stop_sb_thread(rec->sb_evlist);
2976 
2977 	perf_session__delete(session);
2978 	return status;
2979 }
2980 
/*
 * Option callback: parse the call graph argument @arg into the global
 * callchain_param, using opt->value as the first argument of
 * record_opts__parse_callchain(). Returns that function's result.
 */
static int record_parse_callchain_opt(const struct option *opt,
			       const char *arg,
			       int unset)
{
	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
}
2987 
2988 static int record_callchain_opt(const struct option *opt,
2989 				const char *arg __maybe_unused,
2990 				int unset)
2991 {
2992 	/*
2993 	 * The -g option only sets the callchain if not already configured by
2994 	 * .perfconfig. It does, however, enable it.
2995 	 */
2996 	if (callchain_param.record_mode != CALLCHAIN_NONE) {
2997 		callchain_param.enabled = true;
2998 		return 0;
2999 	}
3000 
3001 	return record_opts__parse_callchain(opt->value, &callchain_param,
3002 					    EM_HOST != EM_S390 ? "fp" : "dwarf",
3003 					    unset);
3004 }
3005 
3006 
3007 static int perf_record_config(const char *var, const char *value, void *cb)
3008 {
3009 	struct record *rec = cb;
3010 
3011 	if (!strcmp(var, "record.build-id")) {
3012 		if (!strcmp(value, "cache"))
3013 			rec->no_buildid_cache = false;
3014 		else if (!strcmp(value, "no-cache"))
3015 			rec->no_buildid_cache = true;
3016 		else if (!strcmp(value, "skip"))
3017 			rec->no_buildid = rec->no_buildid_cache = true;
3018 		else if (!strcmp(value, "mmap"))
3019 			rec->buildid_mmap = true;
3020 		else if (!strcmp(value, "no-mmap"))
3021 			rec->buildid_mmap = false;
3022 		else
3023 			return -1;
3024 		return 0;
3025 	}
3026 	if (!strcmp(var, "record.call-graph")) {
3027 		var = "call-graph.record-mode";
3028 		return perf_default_config(var, value, cb);
3029 	}
3030 #ifdef HAVE_AIO_SUPPORT
3031 	if (!strcmp(var, "record.aio")) {
3032 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3033 		if (!rec->opts.nr_cblocks)
3034 			rec->opts.nr_cblocks = nr_cblocks_default;
3035 	}
3036 #endif
3037 	if (!strcmp(var, "record.debuginfod")) {
3038 		rec->debuginfod.urls = strdup(value);
3039 		if (!rec->debuginfod.urls)
3040 			return -ENOMEM;
3041 		rec->debuginfod.set = true;
3042 	}
3043 
3044 	return 0;
3045 }
3046 
/*
 * Option callback: opt->value is the struct record; hand the argument
 * string to evlist__parse_event_enable_time() together with the record's
 * evlist and options, and return its result.
 */
static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
{
	struct record *rec = (struct record *)opt->value;

	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
}
3053 
3054 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3055 {
3056 	struct record_opts *opts = (struct record_opts *)opt->value;
3057 
3058 	if (unset || !str)
3059 		return 0;
3060 
3061 	if (!strcasecmp(str, "node"))
3062 		opts->affinity = PERF_AFFINITY_NODE;
3063 	else if (!strcasecmp(str, "cpu"))
3064 		opts->affinity = PERF_AFFINITY_CPU;
3065 
3066 	return 0;
3067 }
3068 
3069 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3070 {
3071 	mask->nbits = nr_bits;
3072 	mask->bits = bitmap_zalloc(mask->nbits);
3073 	if (!mask->bits)
3074 		return -ENOMEM;
3075 
3076 	return 0;
3077 }
3078 
/*
 * Free the bitmap of @mask and reset its bit count to 0.
 * mask->bits itself is not cleared here; callers that keep using the mask
 * reset the pointer themselves.
 */
static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
{
	bitmap_free(mask->bits);
	mask->nbits = 0;
}
3084 
3085 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3086 {
3087 	int ret;
3088 
3089 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3090 	if (ret) {
3091 		mask->affinity.bits = NULL;
3092 		return ret;
3093 	}
3094 
3095 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3096 	if (ret) {
3097 		record__mmap_cpu_mask_free(&mask->maps);
3098 		mask->maps.bits = NULL;
3099 	}
3100 
3101 	return ret;
3102 }
3103 
/* Release both bitmaps (maps and affinity) of a thread mask. */
static void record__thread_mask_free(struct thread_mask *mask)
{
	record__mmap_cpu_mask_free(&mask->maps);
	record__mmap_cpu_mask_free(&mask->affinity);
}
3109 
3110 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3111 {
3112 	int s;
3113 	struct record_opts *opts = opt->value;
3114 
3115 	if (unset || !str || !strlen(str)) {
3116 		opts->threads_spec = THREAD_SPEC__CPU;
3117 	} else {
3118 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3119 			if (s == THREAD_SPEC__USER) {
3120 				opts->threads_user_spec = strdup(str);
3121 				if (!opts->threads_user_spec)
3122 					return -ENOMEM;
3123 				opts->threads_spec = THREAD_SPEC__USER;
3124 				break;
3125 			}
3126 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3127 				opts->threads_spec = s;
3128 				break;
3129 			}
3130 		}
3131 	}
3132 
3133 	if (opts->threads_spec == THREAD_SPEC__USER)
3134 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3135 	else
3136 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3137 
3138 	return 0;
3139 }
3140 
3141 static int parse_output_max_size(const struct option *opt,
3142 				 const char *str, int unset)
3143 {
3144 	unsigned long *s = (unsigned long *)opt->value;
3145 	static struct parse_tag tags_size[] = {
3146 		{ .tag  = 'B', .mult = 1       },
3147 		{ .tag  = 'K', .mult = 1 << 10 },
3148 		{ .tag  = 'M', .mult = 1 << 20 },
3149 		{ .tag  = 'G', .mult = 1 << 30 },
3150 		{ .tag  = 0 },
3151 	};
3152 	unsigned long val;
3153 
3154 	if (unset) {
3155 		*s = 0;
3156 		return 0;
3157 	}
3158 
3159 	val = parse_tag_value(str, tags_size);
3160 	if (val != (unsigned long) -1) {
3161 		*s = val;
3162 		return 0;
3163 	}
3164 
3165 	return -1;
3166 }
3167 
3168 static int record__parse_mmap_pages(const struct option *opt,
3169 				    const char *str,
3170 				    int unset __maybe_unused)
3171 {
3172 	struct record_opts *opts = opt->value;
3173 	char *s, *p;
3174 	unsigned int mmap_pages;
3175 	int ret;
3176 
3177 	if (!str)
3178 		return -EINVAL;
3179 
3180 	s = strdup(str);
3181 	if (!s)
3182 		return -ENOMEM;
3183 
3184 	p = strchr(s, ',');
3185 	if (p)
3186 		*p = '\0';
3187 
3188 	if (*s) {
3189 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3190 		if (ret)
3191 			goto out_free;
3192 		opts->mmap_pages = mmap_pages;
3193 	}
3194 
3195 	if (!p) {
3196 		ret = 0;
3197 		goto out_free;
3198 	}
3199 
3200 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3201 	if (ret)
3202 		goto out_free;
3203 
3204 	opts->auxtrace_mmap_pages = mmap_pages;
3205 
3206 out_free:
3207 	free(s);
3208 	return ret;
3209 }
3210 
3211 static int record__parse_off_cpu_thresh(const struct option *opt,
3212 					const char *str,
3213 					int unset __maybe_unused)
3214 {
3215 	struct record_opts *opts = opt->value;
3216 	char *endptr;
3217 	u64 off_cpu_thresh_ms;
3218 
3219 	if (!str)
3220 		return -EINVAL;
3221 
3222 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3223 
3224 	/* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */
3225 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3226 		return -EINVAL;
3227 	else
3228 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3229 
3230 	return 0;
3231 }
3232 
3233 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3234 {
3235 }
3236 
3237 static int parse_control_option(const struct option *opt,
3238 				const char *str,
3239 				int unset __maybe_unused)
3240 {
3241 	struct record_opts *opts = opt->value;
3242 
3243 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3244 }
3245 
3246 static void switch_output_size_warn(struct record *rec)
3247 {
3248 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3249 	struct switch_output *s = &rec->switch_output;
3250 
3251 	wakeup_size /= 2;
3252 
3253 	if (s->size < wakeup_size) {
3254 		char buf[100];
3255 
3256 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3257 		pr_warning("WARNING: switch-output data size lower than "
3258 			   "wakeup kernel buffer size (%s) "
3259 			   "expect bigger perf.data sizes\n", buf);
3260 	}
3261 }
3262 
3263 static int switch_output_setup(struct record *rec)
3264 {
3265 	struct switch_output *s = &rec->switch_output;
3266 	static struct parse_tag tags_size[] = {
3267 		{ .tag  = 'B', .mult = 1       },
3268 		{ .tag  = 'K', .mult = 1 << 10 },
3269 		{ .tag  = 'M', .mult = 1 << 20 },
3270 		{ .tag  = 'G', .mult = 1 << 30 },
3271 		{ .tag  = 0 },
3272 	};
3273 	static struct parse_tag tags_time[] = {
3274 		{ .tag  = 's', .mult = 1        },
3275 		{ .tag  = 'm', .mult = 60       },
3276 		{ .tag  = 'h', .mult = 60*60    },
3277 		{ .tag  = 'd', .mult = 60*60*24 },
3278 		{ .tag  = 0 },
3279 	};
3280 	unsigned long val;
3281 
3282 	/*
3283 	 * If we're using --switch-output-events, then we imply its
3284 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3285 	 *  thread to its parent.
3286 	 */
3287 	if (rec->switch_output_event_set) {
3288 		if (record__threads_enabled(rec)) {
3289 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3290 			return 0;
3291 		}
3292 		goto do_signal;
3293 	}
3294 
3295 	if (!s->set)
3296 		return 0;
3297 
3298 	if (record__threads_enabled(rec)) {
3299 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3300 		return 0;
3301 	}
3302 
3303 	if (!strcmp(s->str, "signal")) {
3304 do_signal:
3305 		s->signal = true;
3306 		pr_debug("switch-output with SIGUSR2 signal\n");
3307 		goto enabled;
3308 	}
3309 
3310 	val = parse_tag_value(s->str, tags_size);
3311 	if (val != (unsigned long) -1) {
3312 		s->size = val;
3313 		pr_debug("switch-output with %s size threshold\n", s->str);
3314 		goto enabled;
3315 	}
3316 
3317 	val = parse_tag_value(s->str, tags_time);
3318 	if (val != (unsigned long) -1) {
3319 		s->time = val;
3320 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3321 			 s->str, s->time);
3322 		goto enabled;
3323 	}
3324 
3325 	return -1;
3326 
3327 enabled:
3328 	rec->timestamp_filename = true;
3329 	s->enabled              = true;
3330 
3331 	if (s->size && !rec->opts.no_buffering)
3332 		switch_output_size_warn(rec);
3333 
3334 	return 0;
3335 }
3336 
/* Usage synopsis lines for 'perf record'; the list is NULL terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Externally visible handle on the usage lines above. */
const char * const *record_usage = __record_usage;
3343 
3344 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3345 				  struct perf_sample *sample, struct machine *machine)
3346 {
3347 	/*
3348 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3349 	 * no need to add them twice.
3350 	 */
3351 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3352 		return 0;
3353 	return perf_event__process_mmap(tool, event, sample, machine);
3354 }
3355 
3356 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3357 				   struct perf_sample *sample, struct machine *machine)
3358 {
3359 	/*
3360 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3361 	 * no need to add them twice.
3362 	 */
3363 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3364 		return 0;
3365 
3366 	return perf_event__process_mmap2(tool, event, sample, machine);
3367 }
3368 
3369 static int process_timestamp_boundary(const struct perf_tool *tool,
3370 				      union perf_event *event __maybe_unused,
3371 				      struct perf_sample *sample,
3372 				      struct machine *machine __maybe_unused)
3373 {
3374 	struct record *rec = container_of(tool, struct record, tool);
3375 
3376 	set_timestamp_boundary(rec, sample->time);
3377 	return 0;
3378 }
3379 
3380 static int parse_record_synth_option(const struct option *opt,
3381 				     const char *str,
3382 				     int unset __maybe_unused)
3383 {
3384 	struct record_opts *opts = opt->value;
3385 	char *p = strdup(str);
3386 
3387 	if (p == NULL)
3388 		return -1;
3389 
3390 	opts->synth = parse_synth_opt(p);
3391 	free(p);
3392 
3393 	if (opts->synth < 0) {
3394 		pr_err("Invalid synth option: %s\n", str);
3395 		return -1;
3396 	}
3397 	return 0;
3398 }
3399 
3400 /*
3401  * XXX Ideally would be local to cmd_record() and passed to a record__new
3402  * because we need to have access to it in record__exit, that is called
3403  * after cmd_record() exits, but since record_options need to be accessible to
3404  * builtin-script, leave it here.
3405  *
 * At least we don't touch it in all the other functions here directly.
3407  *
3408  * Just say no to tons of global variables, sigh.
3409  */
3410 static struct record record = {
3411 	.opts = {
3412 		.sample_time	     = true,
3413 		.mmap_pages	     = UINT_MAX,
3414 		.user_freq	     = UINT_MAX,
3415 		.user_interval	     = ULLONG_MAX,
3416 		.freq		     = 4000,
3417 		.target		     = {
3418 			.uses_mmap   = true,
3419 			.default_per_cpu = true,
3420 		},
3421 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3422 		.nr_threads_synthesize = 1,
3423 		.ctl_fd              = -1,
3424 		.ctl_fd_ack          = -1,
3425 		.synth               = PERF_SYNTH_ALL,
3426 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3427 	},
3428 	.buildid_mmap = true,
3429 };
3430 
3431 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3432 	"\n\t\t\t\tDefault: fp";
3433 
/* Set by --dry-run ("Parse options then exit"). */
static bool dry_run = false;
3435 
3436 static struct parse_events_option_args parse_events_option_args = {
3437 	.evlistp = &record.evlist,
3438 };
3439 
3440 static struct parse_events_option_args switch_output_parse_events_option_args = {
3441 	.evlistp = &record.sb_evlist,
3442 };
3443 
/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 *
 * Command line option table of 'perf record'.  Most entries store straight
 * into a field of the global 'record' (or into another global such as
 * 'verbose', 'quiet', 'dry_run', 'proc_map_timeout' or 'symbol_conf');
 * the OPT_CALLBACK* entries hand the argument to a parsing callback.
 * The table is terminated by OPT_END().
 */
static struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_BOOLEAN(0, "latency", &record.latency,
		    "Enable data collection for latency profiling.\n"
		    "\t\t\t  Use perf report --latency for latency-centric profile."),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.data.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
		    "synthesize non-sample events at the end of output"),
	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
		    "Fail if the specified frequency can't be used"),
	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
		     "profile at this frequency",
		      record__parse_freq),
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
		     record__mmap_flush_parse),
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
		    "Record the sample physical addresses"),
	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
		    "Record the sampled data address data page size"),
	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
		    "Record the sampled code address (ip) page size"),
	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
		    "Record the data source for memory operations"),
	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
		    "Record the sample identifier"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
			"Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
			&record.no_buildid_cache_set,
			"do not update the buildid cache"),
	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
			&record.no_buildid_set,
			"do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_CALLBACK('D', "delay", &record, "ms",
		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
		     record__parse_event_enable_time),
	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use '-I?' to list register names", parse_intr_regs),
	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
		    "sample selected machine registers in user space,"
		    " use '--user-regs=?' to list register names", parse_user_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
			  "opts", "sample AUX area", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
		    "Record namespaces events"),
	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
		    "Record cgroup events"),
	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
			&record.opts.record_switch_events_set,
			"Record context switch events"),
	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
			 "Configure all used events to run in kernel space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
			 "Configure all used events to run in user space.",
			 PARSE_OPT_EXCLUSIVE),
	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
		    "collect kernel callchains"),
	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
		    "collect user callchains"),
	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
		   "file", "vmlinux pathname"),
	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
		    "Record build-id of all DSOs regardless of hits"),
	OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
			"Record build-id in mmap events and skip build-id processing."),
	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
		    "append timestamp to output filename"),
	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
		    "Record timestamp boundary (time of first/last samples)"),
	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
			  "signal"),
	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
			 &record.switch_output_event_set, "switch output event",
			 "switch output event selector. use 'perf list' to list available events",
			 parse_events_option_new_evlist),
	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
		   "Limit number of switch output generated files"),
	OPT_BOOLEAN(0, "dry-run", &dry_run,
		    "Parse options then exit"),
	/* --aio is only offered when built with HAVE_AIO_SUPPORT */
#ifdef HAVE_AIO_SUPPORT
	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
		     record__aio_parse),
#endif
	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
		     record__parse_affinity),
	/* -z/--compression-level is only offered when built with HAVE_ZSTD_SUPPORT */
#ifdef HAVE_ZSTD_SUPPORT
	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
			    record__parse_comp_level),
#endif
	OPT_CALLBACK(0, "max-size", &record.output_max_size,
		     "size", "Limit the maximum size of the output file", parse_output_max_size),
	OPT_UINTEGER(0, "num-thread-synthesize",
		     &record.opts.nr_threads_synthesize,
		     "number of threads to run for event synthesis"),
	/* --pfm-events is only offered when built with HAVE_LIBPFM */
#ifdef HAVE_LIBPFM
	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
		"libpfm4 event selector. use 'perf list' to list available events",
		parse_libpfm_events_option),
#endif
	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
		      parse_control_option),
	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
			  &record.debuginfod.set, "debuginfod urls",
			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
			  "system"),
	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
			    "write collected trace data into several data files using parallel threads",
			    record__parse_threads),
	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
		   "BPF filter action"),
	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
		     record__parse_off_cpu_thresh),
	OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
			&record.opts.record_data_mmap_set,
			"Record mmap events for non-executable mappings"),
	OPT_END()
};

/* Non-static handle on the option table above. */
struct option *record_options = __record_options;
3664 
3665 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3666 {
3667 	struct perf_cpu cpu;
3668 	unsigned int idx;
3669 
3670 	if (cpu_map__is_dummy(cpus))
3671 		return 0;
3672 
3673 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3674 		/* Return ENODEV is input cpu is greater than max cpu */
3675 		if ((unsigned long)cpu.cpu > mask->nbits)
3676 			return -ENODEV;
3677 		__set_bit(cpu.cpu, mask->bits);
3678 	}
3679 
3680 	return 0;
3681 }
3682 
3683 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3684 {
3685 	struct perf_cpu_map *cpus;
3686 
3687 	cpus = perf_cpu_map__new(mask_spec);
3688 	if (!cpus)
3689 		return -ENOMEM;
3690 
3691 	bitmap_zero(mask->bits, mask->nbits);
3692 	if (record__mmap_cpu_mask_init(mask, cpus))
3693 		return -ENODEV;
3694 
3695 	perf_cpu_map__put(cpus);
3696 
3697 	return 0;
3698 }
3699 
3700 static void record__free_thread_masks(struct record *rec, int nr_threads)
3701 {
3702 	int t;
3703 
3704 	if (rec->thread_masks)
3705 		for (t = 0; t < nr_threads; t++)
3706 			record__thread_mask_free(&rec->thread_masks[t]);
3707 
3708 	zfree(&rec->thread_masks);
3709 }
3710 
3711 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3712 {
3713 	int t, ret;
3714 
3715 	rec->thread_masks = calloc(nr_threads, sizeof(*(rec->thread_masks)));
3716 	if (!rec->thread_masks) {
3717 		pr_err("Failed to allocate thread masks\n");
3718 		return -ENOMEM;
3719 	}
3720 
3721 	for (t = 0; t < nr_threads; t++) {
3722 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3723 		if (ret) {
3724 			pr_err("Failed to allocate thread masks[%d]\n", t);
3725 			goto out_free;
3726 		}
3727 	}
3728 
3729 	return 0;
3730 
3731 out_free:
3732 	record__free_thread_masks(rec, nr_threads);
3733 
3734 	return ret;
3735 }
3736 
3737 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3738 {
3739 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3740 
3741 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3742 	if (ret)
3743 		return ret;
3744 
3745 	rec->nr_threads = nr_cpus;
3746 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3747 
3748 	for (t = 0; t < rec->nr_threads; t++) {
3749 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3750 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3751 		if (verbose > 0) {
3752 			pr_debug("thread_masks[%d]: ", t);
3753 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3754 			pr_debug("thread_masks[%d]: ", t);
3755 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3756 		}
3757 	}
3758 
3759 	return 0;
3760 }
3761 
3762 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3763 					  const char **maps_spec, const char **affinity_spec,
3764 					  u32 nr_spec)
3765 {
3766 	u32 s;
3767 	int ret = 0, t = 0;
3768 	struct mmap_cpu_mask cpus_mask;
3769 	struct thread_mask thread_mask, full_mask, *thread_masks;
3770 
3771 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3772 	if (ret) {
3773 		pr_err("Failed to allocate CPUs mask\n");
3774 		return ret;
3775 	}
3776 
3777 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3778 	if (ret) {
3779 		pr_err("Failed to init cpu mask\n");
3780 		goto out_free_cpu_mask;
3781 	}
3782 
3783 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3784 	if (ret) {
3785 		pr_err("Failed to allocate full mask\n");
3786 		goto out_free_cpu_mask;
3787 	}
3788 
3789 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3790 	if (ret) {
3791 		pr_err("Failed to allocate thread mask\n");
3792 		goto out_free_full_and_cpu_masks;
3793 	}
3794 
3795 	for (s = 0; s < nr_spec; s++) {
3796 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3797 		if (ret) {
3798 			pr_err("Failed to initialize maps thread mask\n");
3799 			goto out_free;
3800 		}
3801 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3802 		if (ret) {
3803 			pr_err("Failed to initialize affinity thread mask\n");
3804 			goto out_free;
3805 		}
3806 
3807 		/* ignore invalid CPUs but do not allow empty masks */
3808 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3809 				cpus_mask.bits, thread_mask.maps.nbits)) {
3810 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3811 			ret = -EINVAL;
3812 			goto out_free;
3813 		}
3814 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3815 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3816 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3817 			ret = -EINVAL;
3818 			goto out_free;
3819 		}
3820 
3821 		/* do not allow intersection with other masks (full_mask) */
3822 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3823 				      thread_mask.maps.nbits)) {
3824 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3825 			ret = -EINVAL;
3826 			goto out_free;
3827 		}
3828 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3829 				      thread_mask.affinity.nbits)) {
3830 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3831 			ret = -EINVAL;
3832 			goto out_free;
3833 		}
3834 
3835 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3836 			  thread_mask.maps.bits, full_mask.maps.nbits);
3837 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3838 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3839 
3840 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3841 		if (!thread_masks) {
3842 			pr_err("Failed to reallocate thread masks\n");
3843 			ret = -ENOMEM;
3844 			goto out_free;
3845 		}
3846 		rec->thread_masks = thread_masks;
3847 		rec->thread_masks[t] = thread_mask;
3848 		if (verbose > 0) {
3849 			pr_debug("thread_masks[%d]: ", t);
3850 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3851 			pr_debug("thread_masks[%d]: ", t);
3852 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3853 		}
3854 		t++;
3855 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3856 		if (ret) {
3857 			pr_err("Failed to allocate thread mask\n");
3858 			goto out_free_full_and_cpu_masks;
3859 		}
3860 	}
3861 	rec->nr_threads = t;
3862 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3863 	if (!rec->nr_threads)
3864 		ret = -EINVAL;
3865 
3866 out_free:
3867 	record__thread_mask_free(&thread_mask);
3868 out_free_full_and_cpu_masks:
3869 	record__thread_mask_free(&full_mask);
3870 out_free_cpu_mask:
3871 	record__mmap_cpu_mask_free(&cpus_mask);
3872 
3873 	return ret;
3874 }
3875 
3876 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3877 {
3878 	int ret;
3879 	struct cpu_topology *topo;
3880 
3881 	topo = cpu_topology__new();
3882 	if (!topo) {
3883 		pr_err("Failed to allocate CPU topology\n");
3884 		return -ENOMEM;
3885 	}
3886 
3887 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3888 					     topo->core_cpus_list, topo->core_cpus_lists);
3889 	cpu_topology__delete(topo);
3890 
3891 	return ret;
3892 }
3893 
3894 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3895 {
3896 	int ret;
3897 	struct cpu_topology *topo;
3898 
3899 	topo = cpu_topology__new();
3900 	if (!topo) {
3901 		pr_err("Failed to allocate CPU topology\n");
3902 		return -ENOMEM;
3903 	}
3904 
3905 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3906 					     topo->package_cpus_list, topo->package_cpus_lists);
3907 	cpu_topology__delete(topo);
3908 
3909 	return ret;
3910 }
3911 
3912 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3913 {
3914 	u32 s;
3915 	int ret;
3916 	const char **spec;
3917 	struct numa_topology *topo;
3918 
3919 	topo = numa_topology__new();
3920 	if (!topo) {
3921 		pr_err("Failed to allocate NUMA topology\n");
3922 		return -ENOMEM;
3923 	}
3924 
3925 	spec = calloc(topo->nr, sizeof(char *));
3926 	if (!spec) {
3927 		pr_err("Failed to allocate NUMA spec\n");
3928 		ret = -ENOMEM;
3929 		goto out_delete_topo;
3930 	}
3931 	for (s = 0; s < topo->nr; s++)
3932 		spec[s] = topo->nodes[s].cpus;
3933 
3934 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3935 
3936 	zfree(&spec);
3937 
3938 out_delete_topo:
3939 	numa_topology__delete(topo);
3940 
3941 	return ret;
3942 }
3943 
3944 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3945 {
3946 	int t, ret;
3947 	u32 s, nr_spec = 0;
3948 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3949 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3950 
3951 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3952 		spec = strtok_r(user_spec, ":", &spec_ptr);
3953 		if (spec == NULL)
3954 			break;
3955 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3956 		mask = strtok_r(spec, "/", &mask_ptr);
3957 		if (mask == NULL)
3958 			break;
3959 		pr_debug2("  maps mask: %s\n", mask);
3960 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3961 		if (!tmp_spec) {
3962 			pr_err("Failed to reallocate maps spec\n");
3963 			ret = -ENOMEM;
3964 			goto out_free;
3965 		}
3966 		maps_spec = tmp_spec;
3967 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3968 		if (!maps_spec[nr_spec]) {
3969 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3970 			ret = -ENOMEM;
3971 			goto out_free;
3972 		}
3973 		mask = strtok_r(NULL, "/", &mask_ptr);
3974 		if (mask == NULL) {
3975 			pr_err("Invalid thread maps or affinity specs\n");
3976 			ret = -EINVAL;
3977 			goto out_free;
3978 		}
3979 		pr_debug2("  affinity mask: %s\n", mask);
3980 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3981 		if (!tmp_spec) {
3982 			pr_err("Failed to reallocate affinity spec\n");
3983 			ret = -ENOMEM;
3984 			goto out_free;
3985 		}
3986 		affinity_spec = tmp_spec;
3987 		affinity_spec[nr_spec] = strdup(mask);
3988 		if (!affinity_spec[nr_spec]) {
3989 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3990 			ret = -ENOMEM;
3991 			goto out_free;
3992 		}
3993 		dup_mask = NULL;
3994 		nr_spec++;
3995 	}
3996 
3997 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3998 					     (const char **)affinity_spec, nr_spec);
3999 
4000 out_free:
4001 	free(dup_mask);
4002 	for (s = 0; s < nr_spec; s++) {
4003 		if (maps_spec)
4004 			free(maps_spec[s]);
4005 		if (affinity_spec)
4006 			free(affinity_spec[s]);
4007 	}
4008 	free(affinity_spec);
4009 	free(maps_spec);
4010 
4011 	return ret;
4012 }
4013 
4014 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4015 {
4016 	int ret;
4017 
4018 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4019 	if (ret)
4020 		return ret;
4021 
4022 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4023 		return -ENODEV;
4024 
4025 	rec->nr_threads = 1;
4026 
4027 	return 0;
4028 }
4029 
4030 static int record__init_thread_masks(struct record *rec)
4031 {
4032 	int ret = 0;
4033 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4034 
4035 	if (!record__threads_enabled(rec))
4036 		return record__init_thread_default_masks(rec, cpus);
4037 
4038 	if (evlist__per_thread(rec->evlist)) {
4039 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4040 		return -EINVAL;
4041 	}
4042 
4043 	switch (rec->opts.threads_spec) {
4044 	case THREAD_SPEC__CPU:
4045 		ret = record__init_thread_cpu_masks(rec, cpus);
4046 		break;
4047 	case THREAD_SPEC__CORE:
4048 		ret = record__init_thread_core_masks(rec, cpus);
4049 		break;
4050 	case THREAD_SPEC__PACKAGE:
4051 		ret = record__init_thread_package_masks(rec, cpus);
4052 		break;
4053 	case THREAD_SPEC__NUMA:
4054 		ret = record__init_thread_numa_masks(rec, cpus);
4055 		break;
4056 	case THREAD_SPEC__USER:
4057 		ret = record__init_thread_user_masks(rec, cpus);
4058 		break;
4059 	default:
4060 		break;
4061 	}
4062 
4063 	return ret;
4064 }
4065 
4066 int cmd_record(int argc, const char **argv)
4067 {
4068 	int err;
4069 	struct record *rec = &record;
4070 	char errbuf[BUFSIZ];
4071 
4072 	setlocale(LC_ALL, "");
4073 
4074 #ifndef HAVE_BPF_SKEL
4075 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4076 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4077 # undef set_nobuild
4078 #endif
4079 
4080 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
4081 	symbol_conf.lazy_load_kernel_maps = true;
4082 	rec->opts.affinity = PERF_AFFINITY_SYS;
4083 
4084 	rec->evlist = evlist__new();
4085 	if (rec->evlist == NULL)
4086 		return -ENOMEM;
4087 
4088 	err = perf_config(perf_record_config, rec);
4089 	if (err)
4090 		return err;
4091 
4092 	argc = parse_options(argc, argv, record_options, record_usage,
4093 			    PARSE_OPT_STOP_AT_NON_OPTION);
4094 	if (quiet)
4095 		perf_quiet_option();
4096 
4097 	err = symbol__validate_sym_arguments();
4098 	if (err)
4099 		return err;
4100 
4101 	perf_debuginfod_setup(&record.debuginfod);
4102 
4103 	/*
4104 	 * Use system wide (-a) for the default target (ie. when no
4105 	 * workload). User ID filtering also implies system-wide.
4106 	 */
4107 	if ((!argc && target__none(&rec->opts.target)) || rec->uid_str)
4108 		rec->opts.target.system_wide = true;
4109 
4110 	if (nr_cgroups && !rec->opts.target.system_wide) {
4111 		usage_with_options_msg(record_usage, record_options,
4112 			"cgroup monitoring only available in system-wide mode");
4113 
4114 	}
4115 
4116 	if (record.latency) {
4117 		/*
4118 		 * There is no fundamental reason why latency profiling
4119 		 * can't work for system-wide mode, but exact semantics
4120 		 * and details are to be defined.
4121 		 * See the following thread for details:
4122 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4123 		 */
4124 		if (record.opts.target.system_wide) {
4125 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4126 			err = -EINVAL;
4127 			goto out_opts;
4128 		}
4129 		record.opts.record_switch_events = true;
4130 	}
4131 
4132 	if (rec->buildid_mmap && !perf_can_record_build_id()) {
4133 		pr_warning("Missing support for build id in kernel mmap events.\n"
4134 			   "Disable this warning with --no-buildid-mmap\n");
4135 		rec->buildid_mmap = false;
4136 	}
4137 
4138 	if (rec->buildid_mmap) {
4139 		/* Enable perf_event_attr::build_id bit. */
4140 		rec->opts.build_id = true;
4141 		/* Disable build-ID table in the header. */
4142 		rec->no_buildid = true;
4143 	} else {
4144 		pr_debug("Disabling build id in synthesized mmap2 events.\n");
4145 		symbol_conf.no_buildid_mmap2 = true;
4146 	}
4147 
4148 	if (rec->no_buildid_set && rec->no_buildid) {
4149 		/* -B implies -N for historic reasons. */
4150 		rec->no_buildid_cache = true;
4151 	}
4152 
4153 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4154 		pr_err("Kernel has no cgroup sampling support.\n");
4155 		err = -EINVAL;
4156 		goto out_opts;
4157 	}
4158 
4159 	if (rec->opts.kcore)
4160 		rec->opts.text_poke = true;
4161 
4162 	if (rec->opts.kcore || record__threads_enabled(rec))
4163 		rec->data.is_dir = true;
4164 
4165 	if (record__threads_enabled(rec)) {
4166 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4167 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4168 			goto out_opts;
4169 		}
4170 		if (record__aio_enabled(rec)) {
4171 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4172 			goto out_opts;
4173 		}
4174 	}
4175 
4176 	if (rec->opts.comp_level != 0) {
4177 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4178 		rec->no_buildid = true;
4179 	}
4180 
4181 	if (rec->opts.record_switch_events &&
4182 	    !perf_can_record_switch_events()) {
4183 		ui__error("kernel does not support recording context switch events\n");
4184 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4185 		err = -EINVAL;
4186 		goto out_opts;
4187 	}
4188 
4189 	if (switch_output_setup(rec)) {
4190 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4191 		err = -EINVAL;
4192 		goto out_opts;
4193 	}
4194 
4195 	if (rec->switch_output.time) {
4196 		signal(SIGALRM, alarm_sig_handler);
4197 		alarm(rec->switch_output.time);
4198 	}
4199 
4200 	if (rec->switch_output.num_files) {
4201 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4202 						      sizeof(char *));
4203 		if (!rec->switch_output.filenames) {
4204 			err = -EINVAL;
4205 			goto out_opts;
4206 		}
4207 	}
4208 
4209 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4210 		rec->timestamp_filename = false;
4211 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4212 	}
4213 
4214 	if (rec->filter_action) {
4215 		if (!strcmp(rec->filter_action, "pin"))
4216 			err = perf_bpf_filter__pin();
4217 		else if (!strcmp(rec->filter_action, "unpin"))
4218 			err = perf_bpf_filter__unpin();
4219 		else {
4220 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4221 			err = -EINVAL;
4222 		}
4223 		goto out_opts;
4224 	}
4225 
4226 	/* For backward compatibility, -d implies --mem-info and --data-mmap */
4227 	if (rec->opts.sample_address) {
4228 		rec->opts.sample_data_src = true;
4229 		if (!rec->opts.record_data_mmap_set)
4230 			rec->opts.record_data_mmap = true;
4231 	}
4232 
4233 	/*
4234 	 * Allow aliases to facilitate the lookup of symbols for address
4235 	 * filters. Refer to auxtrace_parse_filters().
4236 	 */
4237 	symbol_conf.allow_aliases = true;
4238 
4239 	symbol__init(NULL);
4240 
4241 	err = record__auxtrace_init(rec);
4242 	if (err)
4243 		goto out;
4244 
4245 	if (dry_run)
4246 		goto out;
4247 
4248 	err = -ENOMEM;
4249 
4250 	if (rec->no_buildid_cache) {
4251 		disable_buildid_cache();
4252 	} else if (rec->switch_output.enabled) {
4253 		/*
4254 		 * In 'perf record --switch-output', disable buildid
4255 		 * generation by default to reduce data file switching
4256 		 * overhead. Still generate buildid if they are required
4257 		 * explicitly using
4258 		 *
4259 		 *  perf record --switch-output --no-no-buildid \
4260 		 *              --no-no-buildid-cache
4261 		 *
4262 		 * Following code equals to:
4263 		 *
4264 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4265 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4266 		 *         disable_buildid_cache();
4267 		 */
4268 		bool disable = true;
4269 
4270 		if (rec->no_buildid_set && !rec->no_buildid)
4271 			disable = false;
4272 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4273 			disable = false;
4274 		if (disable) {
4275 			rec->no_buildid = true;
4276 			rec->no_buildid_cache = true;
4277 			disable_buildid_cache();
4278 		}
4279 	}
4280 
4281 	if (record.opts.overwrite)
4282 		record.opts.tail_synthesize = true;
4283 
4284 	if (rec->evlist->core.nr_entries == 0) {
4285 		struct evlist *def_evlist = evlist__new_default(&rec->opts.target,
4286 								callchain_param.enabled);
4287 
4288 		if (!def_evlist)
4289 			goto out;
4290 
4291 		evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
4292 		evlist__delete(def_evlist);
4293 	}
4294 
4295 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4296 		rec->opts.no_inherit = true;
4297 
4298 	err = target__validate(&rec->opts.target);
4299 	if (err) {
4300 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4301 		ui__warning("%s\n", errbuf);
4302 	}
4303 
4304 	if (rec->uid_str) {
4305 		uid_t uid = parse_uid(rec->uid_str);
4306 
4307 		if (uid == UINT_MAX) {
4308 			ui__error("Invalid User: %s", rec->uid_str);
4309 			err = -EINVAL;
4310 			goto out;
4311 		}
4312 		err = parse_uid_filter(rec->evlist, uid);
4313 		if (err)
4314 			goto out;
4315 	}
4316 
4317 	/* Enable ignoring missing threads when -p option is defined. */
4318 	rec->opts.ignore_missing_thread = rec->opts.target.pid;
4319 
4320 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4321 
4322 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4323 		arch__add_leaf_frame_record_opts(&rec->opts);
4324 
4325 	err = -ENOMEM;
4326 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4327 		if (rec->opts.target.pid != NULL) {
4328 			pr_err("Couldn't create thread/CPU maps: %s\n",
4329 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4330 			goto out;
4331 		}
4332 		else
4333 			usage_with_options(record_usage, record_options);
4334 	}
4335 
4336 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4337 	if (err)
4338 		goto out;
4339 
4340 	/*
4341 	 * We take all buildids when the file contains
4342 	 * AUX area tracing data because we do not decode the
4343 	 * trace because it would take too long.
4344 	 */
4345 	if (rec->opts.full_auxtrace)
4346 		rec->buildid_all = true;
4347 
4348 	if (rec->opts.text_poke) {
4349 		err = record__config_text_poke(rec->evlist);
4350 		if (err) {
4351 			pr_err("record__config_text_poke failed, error %d\n", err);
4352 			goto out;
4353 		}
4354 	}
4355 
4356 	if (rec->off_cpu) {
4357 		err = record__config_off_cpu(rec);
4358 		if (err) {
4359 			pr_err("record__config_off_cpu failed, error %d\n", err);
4360 			goto out;
4361 		}
4362 	}
4363 
4364 	if (record_opts__config(&rec->opts)) {
4365 		err = -EINVAL;
4366 		goto out;
4367 	}
4368 
4369 	err = record__config_tracking_events(rec);
4370 	if (err) {
4371 		pr_err("record__config_tracking_events failed, error %d\n", err);
4372 		goto out;
4373 	}
4374 
4375 	err = record__init_thread_masks(rec);
4376 	if (err) {
4377 		pr_err("Failed to initialize parallel data streaming masks\n");
4378 		goto out;
4379 	}
4380 
4381 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4382 		rec->opts.nr_cblocks = nr_cblocks_max;
4383 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4384 
4385 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4386 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4387 
4388 	if (rec->opts.comp_level > comp_level_max)
4389 		rec->opts.comp_level = comp_level_max;
4390 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4391 
4392 	err = __cmd_record(&record, argc, argv);
4393 out:
4394 	record__free_thread_masks(rec, rec->nr_threads);
4395 	rec->nr_threads = 0;
4396 	symbol__exit();
4397 	auxtrace_record__free(rec->itr);
4398 out_opts:
4399 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4400 	evlist__delete(rec->evlist);
4401 	return err;
4402 }
4403 
4404 static void snapshot_sig_handler(int sig __maybe_unused)
4405 {
4406 	struct record *rec = &record;
4407 
4408 	hit_auxtrace_snapshot_trigger(rec);
4409 
4410 	if (switch_output_signal(rec))
4411 		trigger_hit(&switch_output_trigger);
4412 }
4413 
4414 static void alarm_sig_handler(int sig __maybe_unused)
4415 {
4416 	struct record *rec = &record;
4417 
4418 	if (switch_output_time(rec))
4419 		trigger_hit(&switch_output_trigger);
4420 }
4421