xref: /linux/tools/perf/builtin-record.c (revision 046fd8206d820b71e7870f7b894b46f8a15ae974)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "util/strbuf.h"
55 #include "asm/bug.h"
56 #include "perf.h"
57 #include "cputopo.h"
58 #include "dwarf-regs.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
/*
 * Settings and state for switching the output file during a session.
 * NOTE(review): only 'signal', 'size' and 'time' have users visible in this
 * part of the file (the switch_output_*() helpers); the meaning of the other
 * fields should be confirmed against their users.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* tested by switch_output_signal() */
	unsigned long	 size;		/* non-zero: threshold compared with rec->bytes_written */
	unsigned long	 time;		/* non-zero: tested by switch_output_time() */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
97 
/*
 * Pair of CPU bitmasks attached to a record_thread.
 *
 * 'maps' is tested against CPU numbers in record__thread_data_init_maps()
 * to choose which mmaps a thread handles.
 * NOTE(review): 'affinity' is presumably the CPU affinity applied to the
 * thread - its users are not visible here, confirm.
 */
struct thread_mask {
	struct mmap_cpu_mask	maps;
	struct mmap_cpu_mask	affinity;
};
102 
/* State kept for one recording thread. */
struct record_thread {
	pid_t			tid;
	struct thread_mask	*mask;		/* mask->maps selects this thread's mmaps */
	struct {
		int		msg[2];		/* pipe() fd pair, -1 while closed */
		int		ack[2];		/* pipe() fd pair, -1 while closed */
	} pipes;
	struct fdarray		pollfd;		/* entries duplicated from evlist->core.pollfd */
	int			ctlfd_pos;
	int			nr_mmaps;	/* number of slots in maps[] and overwrite_maps[] */
	struct mmap		**maps;		/* pointers into evlist->mmap[] */
	struct mmap		**overwrite_maps;	/* pointers into evlist->overwrite_mmap[] */
	struct record		*rec;
	unsigned long long	samples;	/* incremented in record__pushfn() */
	unsigned long		waking;
	u64			bytes_written;	/* bytes written to per-map files by record__write() */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
122 
/*
 * Thread-local (__thread) pointer to the record_thread whose counters are
 * updated by record__write() and record__pushfn().
 */
static __thread struct record_thread *thread;
124 
/*
 * Message identifiers; THREAD_MSG__MAX is the number of values and sizes
 * thread_msg_tags[].
 */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};
130 
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 	"UNDEFINED", "READY"
133 };
134 
/*
 * Thread specification kinds; THREAD_SPEC__MAX is the number of values and
 * sizes thread_spec_tags[].
 */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};
144 
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 	"undefined", "cpu", "core", "package", "numa", "user"
147 };
148 
/*
 * One (evlist pollfd index, thread pollfd index) pair, stored in
 * rec->index_map[] by record__map_thread_evlist_pollfd_indexes().
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
153 
/*
 * Top-level state of a 'perf record' session.
 *
 * The struct is recovered from its embedded 'tool' member with
 * container_of() in the tool callbacks of this file.
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	u64			bytes_written;		/* bytes accounted to the main data file */
	u64			thread_bytes_written;	/* bytes accounted to per-map files */
	struct perf_data	data;
	struct auxtrace_record	*itr;			/* set up by record__auxtrace_init() */
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			latency;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			buildid_mmap_set;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	const char		*filter_action;
	const char		*uid_str;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	int			nr_threads;		/* number of entries in thread_data[] */
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	struct pollfd_index_map	*index_map;		/* grown by record__map_thread_evlist_pollfd_indexes() */
	size_t			index_map_sz;
	size_t			index_map_cnt;		/* used entries in index_map[] */
};
191 
/*
 * Set to 1 by sig_handler() and by record__write() once the output size
 * limit is reached.
 */
static volatile int done;

static volatile int auxtrace_record__snapshot_started;
/* Trigger state for AUX area snapshots and for output switching. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);
197 
/*
 * Textual tags for the affinity modes.
 * NOTE(review): presumably indexed by the PERF_AFFINITY_* values - the enum
 * is not visible here, confirm the order matches.
 */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
201 
/* Forward declarations of tool callbacks defined later in the file. */
static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
				  struct perf_sample *sample, struct machine *machine);
static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
				   struct perf_sample *sample, struct machine *machine);
static int process_timestamp_boundary(const struct perf_tool *tool,
				      union perf_event *event,
				      struct perf_sample *sample,
				      struct machine *machine);
210 
#ifndef HAVE_GETTID
/* Fallback used when HAVE_GETTID is not defined: issue the raw gettid syscall. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
217 
218 static int record__threads_enabled(struct record *rec)
219 {
220 	return rec->opts.threads_spec;
221 }
222 
223 static bool switch_output_signal(struct record *rec)
224 {
225 	return rec->switch_output.signal &&
226 	       trigger_is_ready(&switch_output_trigger);
227 }
228 
229 static bool switch_output_size(struct record *rec)
230 {
231 	return rec->switch_output.size &&
232 	       trigger_is_ready(&switch_output_trigger) &&
233 	       (rec->bytes_written >= rec->switch_output.size);
234 }
235 
236 static bool switch_output_time(struct record *rec)
237 {
238 	return rec->switch_output.time &&
239 	       trigger_is_ready(&switch_output_trigger);
240 }
241 
242 static u64 record__bytes_written(struct record *rec)
243 {
244 	return rec->bytes_written + rec->thread_bytes_written;
245 }
246 
247 static bool record__output_max_size_exceeded(struct record *rec)
248 {
249 	return rec->output_max_size &&
250 	       (record__bytes_written(rec) >= rec->output_max_size);
251 }
252 
253 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
254 			 void *bf, size_t size)
255 {
256 	struct perf_data_file *file = &rec->session->data->file;
257 
258 	if (map && map->file)
259 		file = map->file;
260 
261 	if (perf_data_file__write(file, bf, size) < 0) {
262 		pr_err("failed to write perf data, error: %m\n");
263 		return -1;
264 	}
265 
266 	if (map && map->file) {
267 		thread->bytes_written += size;
268 		rec->thread_bytes_written += size;
269 	} else {
270 		rec->bytes_written += size;
271 	}
272 
273 	if (record__output_max_size_exceeded(rec) && !done) {
274 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
275 				" stopping session ]\n",
276 				record__bytes_written(rec) >> 10);
277 		done = 1;
278 	}
279 
280 	if (switch_output_size(rec))
281 		trigger_hit(&switch_output_trigger);
282 
283 	return 0;
284 }
285 
/* Forward declarations needed by the AIO code below. */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
290 
291 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue an asynchronous write of @size bytes from @buf
 * to @trace_fd at offset @off.
 *
 * Queueing is retried for as long as aio_write() fails with EAGAIN.  On any
 * other failure the control block is marked unused (aio_fildes = -1) and the
 * error is logged.
 *
 * Returns the last aio_write() result: 0 when queued, non-zero on failure.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (!rc)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
316 
317 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
318 {
319 	void *rem_buf;
320 	off_t rem_off;
321 	size_t rem_size;
322 	int rc, aio_errno;
323 	ssize_t aio_ret, written;
324 
325 	aio_errno = aio_error(cblock);
326 	if (aio_errno == EINPROGRESS)
327 		return 0;
328 
329 	written = aio_ret = aio_return(cblock);
330 	if (aio_ret < 0) {
331 		if (aio_errno != EINTR)
332 			pr_err("failed to write perf data, error: %m\n");
333 		written = 0;
334 	}
335 
336 	rem_size = cblock->aio_nbytes - written;
337 
338 	if (rem_size == 0) {
339 		cblock->aio_fildes = -1;
340 		/*
341 		 * md->refcount is incremented in record__aio_pushfn() for
342 		 * every aio write request started in record__aio_push() so
343 		 * decrement it because the request is now complete.
344 		 */
345 		perf_mmap__put(&md->core);
346 		rc = 1;
347 	} else {
348 		/*
349 		 * aio write request may require restart with the
350 		 * remainder if the kernel didn't write whole
351 		 * chunk at once.
352 		 */
353 		rem_off = cblock->aio_offset + written;
354 		rem_buf = (void *)(cblock->aio_buf + written);
355 		record__aio_write(cblock, cblock->aio_fildes,
356 				rem_buf, rem_size, rem_off);
357 		rc = 0;
358 	}
359 
360 	return rc;
361 }
362 
363 static int record__aio_sync(struct mmap *md, bool sync_all)
364 {
365 	struct aiocb **aiocb = md->aio.aiocb;
366 	struct aiocb *cblocks = md->aio.cblocks;
367 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
368 	int i, do_suspend;
369 
370 	do {
371 		do_suspend = 0;
372 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
373 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
374 				if (sync_all)
375 					aiocb[i] = NULL;
376 				else
377 					return i;
378 			} else {
379 				/*
380 				 * Started aio write is not complete yet
381 				 * so it has to be waited before the
382 				 * next allocation.
383 				 */
384 				aiocb[i] = &cblocks[i];
385 				do_suspend = 1;
386 			}
387 		}
388 		if (!do_suspend)
389 			return -1;
390 
391 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
392 			if (!(errno == EAGAIN || errno == EINTR))
393 				pr_err("failed to sync perf data, error: %m\n");
394 		}
395 	} while (1);
396 }
397 
/* Context passed through perf_mmap__push() to record__aio_pushfn(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer (one of map->aio.data[]) */
	size_t		size;	/* bytes stored in 'data' so far */
};
403 
404 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
405 {
406 	struct record_aio *aio = to;
407 
408 	/*
409 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
410 	 * to release space in the kernel buffer as fast as possible, calling
411 	 * perf_mmap__consume() from perf_mmap__push() function.
412 	 *
413 	 * That lets the kernel to proceed with storing more profiling data into
414 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
415 	 *
416 	 * Coping can be done in two steps in case the chunk of profiling data
417 	 * crosses the upper bound of the kernel buffer. In this case we first move
418 	 * part of data from map->start till the upper bound and then the remainder
419 	 * from the beginning of the kernel buffer till the end of the data chunk.
420 	 */
421 
422 	if (record__comp_enabled(aio->rec)) {
423 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
424 						   mmap__mmap_len(map) - aio->size,
425 						   buf, size);
426 		if (compressed < 0)
427 			return (int)compressed;
428 
429 		size = compressed;
430 	} else {
431 		memcpy(aio->data + aio->size, buf, size);
432 	}
433 
434 	if (!aio->size) {
435 		/*
436 		 * Increment map->refcount to guard map->aio.data[] buffer
437 		 * from premature deallocation because map object can be
438 		 * released earlier than aio write request started on
439 		 * map->aio.data[] buffer is complete.
440 		 *
441 		 * perf_mmap__put() is done at record__aio_complete()
442 		 * after started aio request completion or at record__aio_push()
443 		 * if the request failed to start.
444 		 */
445 		perf_mmap__get(&map->core);
446 	}
447 
448 	aio->size += size;
449 
450 	return size;
451 }
452 
453 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
454 {
455 	int ret, idx;
456 	int trace_fd = rec->session->data->file.fd;
457 	struct record_aio aio = { .rec = rec, .size = 0 };
458 
459 	/*
460 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
461 	 * becomes available after previous aio write operation.
462 	 */
463 
464 	idx = record__aio_sync(map, false);
465 	aio.data = map->aio.data[idx];
466 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
467 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
468 		return ret;
469 
470 	rec->samples++;
471 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
472 	if (!ret) {
473 		*off += aio.size;
474 		rec->bytes_written += aio.size;
475 		if (switch_output_size(rec))
476 			trigger_hit(&switch_output_trigger);
477 	} else {
478 		/*
479 		 * Decrement map->refcount incremented in record__aio_pushfn()
480 		 * back if record__aio_write() operation failed to start, otherwise
481 		 * map->refcount is decremented in record__aio_complete() after
482 		 * aio write operation finishes successfully.
483 		 */
484 		perf_mmap__put(&map->core);
485 	}
486 
487 	return ret;
488 }
489 
/* Returns the current file offset of @trace_fd, or -1 if lseek() fails. */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
494 
/* Moves the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
499 
500 static void record__aio_mmap_read_sync(struct record *rec)
501 {
502 	int i;
503 	struct evlist *evlist = rec->evlist;
504 	struct mmap *maps = evlist->mmap;
505 
506 	if (!record__aio_enabled(rec))
507 		return;
508 
509 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
510 		struct mmap *map = &maps[i];
511 
512 		if (map->core.base)
513 			record__aio_sync(map, true);
514 	}
515 }
516 
/* Number of aio control blocks used when the option is given without a value. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper limit for opts->nr_cblocks - its user is not visible here. */
static int nr_cblocks_max = 4;
519 
520 static int record__aio_parse(const struct option *opt,
521 			     const char *str,
522 			     int unset)
523 {
524 	struct record_opts *opts = (struct record_opts *)opt->value;
525 
526 	if (unset) {
527 		opts->nr_cblocks = 0;
528 	} else {
529 		if (str)
530 			opts->nr_cblocks = strtol(str, NULL, 0);
531 		if (!opts->nr_cblocks)
532 			opts->nr_cblocks = nr_cblocks_default;
533 	}
534 
535 	return 0;
536 }
537 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: the maximum is 0. */
static int nr_cblocks_max = 0;
539 
/* Stub used when HAVE_AIO_SUPPORT is not defined: always fails with -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}
545 
/* Stub used when HAVE_AIO_SUPPORT is not defined: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}
550 
/* Stub used when HAVE_AIO_SUPPORT is not defined: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}
554 
/* Stub used when HAVE_AIO_SUPPORT is not defined: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
558 #endif
559 
560 static int record__aio_enabled(struct record *rec)
561 {
562 	return rec->opts.nr_cblocks > 0;
563 }
564 
565 #define MMAP_FLUSH_DEFAULT 1
566 static int record__mmap_flush_parse(const struct option *opt,
567 				    const char *str,
568 				    int unset)
569 {
570 	int flush_max;
571 	struct record_opts *opts = (struct record_opts *)opt->value;
572 	static struct parse_tag tags[] = {
573 			{ .tag  = 'B', .mult = 1       },
574 			{ .tag  = 'K', .mult = 1 << 10 },
575 			{ .tag  = 'M', .mult = 1 << 20 },
576 			{ .tag  = 'G', .mult = 1 << 30 },
577 			{ .tag  = 0 },
578 	};
579 
580 	if (unset)
581 		return 0;
582 
583 	if (str) {
584 		opts->mmap_flush = parse_tag_value(str, tags);
585 		if (opts->mmap_flush == (int)-1)
586 			opts->mmap_flush = strtol(str, NULL, 0);
587 	}
588 
589 	if (!opts->mmap_flush)
590 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
591 
592 	flush_max = evlist__mmap_size(opts->mmap_pages);
593 	flush_max /= 4;
594 	if (opts->mmap_flush > flush_max)
595 		opts->mmap_flush = flush_max;
596 
597 	return 0;
598 }
599 
#ifdef HAVE_ZSTD_SUPPORT
/* Compression level used when the option is given without a value. */
static unsigned int comp_level_default = 1;

/*
 * Option callback for the compression level.
 *
 * A negated option sets the level to 0.  Otherwise the level is taken from
 * @str when present (parsed with strtol()), and a resulting 0 is replaced by
 * comp_level_default.  Always returns 0.
 */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
		return 0;
	}

	if (str)
		opts->comp_level = strtol(str, NULL, 0);

	if (opts->comp_level == 0)
		opts->comp_level = comp_level_default;

	return 0;
}
#endif
/* NOTE(review): presumably the highest accepted compression level - its user is not visible here. */
static unsigned int comp_level_max = 22;
620 
621 static int record__comp_enabled(struct record *rec)
622 {
623 	return rec->opts.comp_level > 0;
624 }
625 
626 static int process_synthesized_event(const struct perf_tool *tool,
627 				     union perf_event *event,
628 				     struct perf_sample *sample __maybe_unused,
629 				     struct machine *machine __maybe_unused)
630 {
631 	struct record *rec = container_of(tool, struct record, tool);
632 	return record__write(rec, NULL, event, event->header.size);
633 }
634 
/* Held by process_locked_synthesized_event() around each synthesized event write. */
static struct mutex synth_lock;
636 
637 static int process_locked_synthesized_event(const struct perf_tool *tool,
638 				     union perf_event *event,
639 				     struct perf_sample *sample __maybe_unused,
640 				     struct machine *machine __maybe_unused)
641 {
642 	int ret;
643 
644 	mutex_lock(&synth_lock);
645 	ret = process_synthesized_event(tool, event, sample, machine);
646 	mutex_unlock(&synth_lock);
647 	return ret;
648 }
649 
650 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
651 {
652 	struct record *rec = to;
653 
654 	if (record__comp_enabled(rec)) {
655 		struct perf_record_compressed2 *event = map->data;
656 		size_t padding = 0;
657 		u8 pad[8] = {0};
658 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
659 						   mmap__mmap_len(map), bf, size);
660 
661 		if (compressed < 0)
662 			return (int)compressed;
663 
664 		bf = event;
665 		thread->samples++;
666 
667 		/*
668 		 * The record from `zstd_compress` is not 8 bytes aligned, which would cause asan
669 		 * error. We make it aligned here.
670 		 */
671 		event->data_size = compressed - sizeof(struct perf_record_compressed2);
672 		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
673 		padding = event->header.size - compressed;
674 		return record__write(rec, map, bf, compressed) ||
675 		       record__write(rec, map, &pad, padding);
676 	}
677 
678 	thread->samples++;
679 	return record__write(rec, map, bf, size);
680 }
681 
/* Last non-SIGCHLD signal seen by sig_handler(), -1 if none. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* When >= 0, sig_handler() writes to this fd to wake up the poll(). */
static volatile sig_atomic_t done_fd = -1;
#endif
687 
688 static void sig_handler(int sig)
689 {
690 	if (sig == SIGCHLD)
691 		child_finished = 1;
692 	else
693 		signr = sig;
694 
695 	done = 1;
696 #ifdef HAVE_EVENTFD_SUPPORT
697 	if (done_fd >= 0) {
698 		u64 tmp = 1;
699 		int orig_errno = errno;
700 
701 		/*
702 		 * It is possible for this signal handler to run after done is
703 		 * checked in the main loop, but before the perf counter fds are
704 		 * polled. If this happens, the poll() will continue to wait
705 		 * even though done is set, and will only break out if either
706 		 * another signal is received, or the counters are ready for
707 		 * read. To ensure the poll() doesn't sleep when done is set,
708 		 * use an eventfd (done_fd) to wake up the poll().
709 		 */
710 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
711 			pr_err("failed to signal wakeup fd, error: %m\n");
712 
713 		errno = orig_errno;
714 	}
715 #endif // HAVE_EVENTFD_SUPPORT
716 }
717 
/*
 * Handler that first calls perf_hooks__recover() and then
 * sighandler_dump_stack() with the received signal number.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
723 
724 static void record__sig_exit(void)
725 {
726 	if (signr == -1)
727 		return;
728 
729 	signal(signr, SIG_DFL);
730 	raise(signr);
731 }
732 
733 static int record__process_auxtrace(const struct perf_tool *tool,
734 				    struct mmap *map,
735 				    union perf_event *event, void *data1,
736 				    size_t len1, void *data2, size_t len2)
737 {
738 	struct record *rec = container_of(tool, struct record, tool);
739 	struct perf_data *data = &rec->data;
740 	size_t padding;
741 	u8 pad[8] = {0};
742 
743 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
744 		off_t file_offset;
745 		int fd = perf_data__fd(data);
746 		int err;
747 
748 		file_offset = lseek(fd, 0, SEEK_CUR);
749 		if (file_offset == -1)
750 			return -1;
751 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
752 						     event, file_offset);
753 		if (err)
754 			return err;
755 	}
756 
757 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
758 	padding = (len1 + len2) & 7;
759 	if (padding)
760 		padding = 8 - padding;
761 
762 	record__write(rec, map, event, event->header.size);
763 	record__write(rec, map, data1, len1);
764 	if (len2)
765 		record__write(rec, map, data2, len2);
766 	record__write(rec, map, &pad, padding);
767 
768 	return 0;
769 }
770 
771 static int record__auxtrace_mmap_read(struct record *rec,
772 				      struct mmap *map)
773 {
774 	int ret;
775 
776 	ret = auxtrace_mmap__read(map, rec->itr,
777 				  perf_session__env(rec->session),
778 				  &rec->tool,
779 				  record__process_auxtrace);
780 	if (ret < 0)
781 		return ret;
782 
783 	if (ret)
784 		rec->samples++;
785 
786 	return 0;
787 }
788 
789 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
790 					       struct mmap *map)
791 {
792 	int ret;
793 
794 	ret = auxtrace_mmap__read_snapshot(map, rec->itr,
795 					   perf_session__env(rec->session),
796 					   &rec->tool,
797 					   record__process_auxtrace,
798 					   rec->opts.auxtrace_snapshot_size);
799 	if (ret < 0)
800 		return ret;
801 
802 	if (ret)
803 		rec->samples++;
804 
805 	return 0;
806 }
807 
808 static int record__auxtrace_read_snapshot_all(struct record *rec)
809 {
810 	int i;
811 	int rc = 0;
812 
813 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
814 		struct mmap *map = &rec->evlist->mmap[i];
815 
816 		if (!map->auxtrace_mmap.base)
817 			continue;
818 
819 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
820 			rc = -1;
821 			goto out;
822 		}
823 	}
824 out:
825 	return rc;
826 }
827 
828 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
829 {
830 	pr_debug("Recording AUX area tracing snapshot\n");
831 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
832 		trigger_error(&auxtrace_snapshot_trigger);
833 	} else {
834 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
835 			trigger_error(&auxtrace_snapshot_trigger);
836 		else
837 			trigger_ready(&auxtrace_snapshot_trigger);
838 	}
839 }
840 
841 static int record__auxtrace_snapshot_exit(struct record *rec)
842 {
843 	if (trigger_is_error(&auxtrace_snapshot_trigger))
844 		return 0;
845 
846 	if (!auxtrace_record__snapshot_started &&
847 	    auxtrace_record__snapshot_start(rec->itr))
848 		return -1;
849 
850 	record__read_auxtrace_snapshot(rec, true);
851 	if (trigger_is_error(&auxtrace_snapshot_trigger))
852 		return -1;
853 
854 	return 0;
855 }
856 
857 static int record__auxtrace_init(struct record *rec)
858 {
859 	int err;
860 
861 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
862 	    && record__threads_enabled(rec)) {
863 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
864 		return -EINVAL;
865 	}
866 
867 	if (!rec->itr) {
868 		rec->itr = auxtrace_record__init(rec->evlist, &err);
869 		if (err)
870 			return err;
871 	}
872 
873 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
874 					      rec->opts.auxtrace_snapshot_opts);
875 	if (err)
876 		return err;
877 
878 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
879 					    rec->opts.auxtrace_sample_opts);
880 	if (err)
881 		return err;
882 
883 	err = auxtrace_parse_aux_action(rec->evlist);
884 	if (err)
885 		return err;
886 
887 	return auxtrace_parse_filters(rec->evlist);
888 }
889 
890 static int record__config_text_poke(struct evlist *evlist)
891 {
892 	struct evsel *evsel;
893 
894 	/* Nothing to do if text poke is already configured */
895 	evlist__for_each_entry(evlist, evsel) {
896 		if (evsel->core.attr.text_poke)
897 			return 0;
898 	}
899 
900 	evsel = evlist__add_dummy_on_all_cpus(evlist);
901 	if (!evsel)
902 		return -ENOMEM;
903 
904 	evsel->core.attr.text_poke = 1;
905 	evsel->core.attr.ksymbol = 1;
906 	evsel->immediate = true;
907 	evsel__set_sample_bit(evsel, TIME);
908 
909 	return 0;
910 }
911 
912 static int record__config_off_cpu(struct record *rec)
913 {
914 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
915 }
916 
917 static bool record__tracking_system_wide(struct record *rec)
918 {
919 	struct evlist *evlist = rec->evlist;
920 	struct evsel *evsel;
921 
922 	/*
923 	 * If non-dummy evsel exists, system_wide sideband is need to
924 	 * help parse sample information.
925 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
926 	 * and PERF_EVENT_COMM event to help parse task executable name.
927 	 */
928 	evlist__for_each_entry(evlist, evsel) {
929 		if (!evsel__is_dummy_event(evsel))
930 			return true;
931 	}
932 
933 	return false;
934 }
935 
936 static int record__config_tracking_events(struct record *rec)
937 {
938 	struct record_opts *opts = &rec->opts;
939 	struct evlist *evlist = rec->evlist;
940 	bool system_wide = false;
941 	struct evsel *evsel;
942 
943 	/*
944 	 * For initial_delay, system wide or a hybrid system, we need to add
945 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
946 	 * delay of waiting or event synthesis.
947 	 */
948 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
949 	    perf_pmus__num_core_pmus() > 1) {
950 		/*
951 		 * User space tasks can migrate between CPUs, so when tracing
952 		 * selected CPUs, sideband for all CPUs is still needed.
953 		 */
954 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
955 			system_wide = true;
956 
957 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
958 		if (!evsel)
959 			return -ENOMEM;
960 
961 		/*
962 		 * Enable the tracking event when the process is forked for
963 		 * initial_delay, immediately for system wide.
964 		 */
965 		if (opts->target.initial_delay && !evsel->immediate &&
966 		    !target__has_cpu(&opts->target))
967 			evsel->core.attr.enable_on_exec = 1;
968 		else
969 			evsel->immediate = 1;
970 	}
971 
972 	return 0;
973 }
974 
975 static bool record__kcore_readable(struct machine *machine)
976 {
977 	char kcore[PATH_MAX];
978 	int fd;
979 
980 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
981 
982 	fd = open(kcore, O_RDONLY);
983 	if (fd < 0)
984 		return false;
985 
986 	close(fd);
987 
988 	return true;
989 }
990 
991 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
992 {
993 	char from_dir[PATH_MAX];
994 	char kcore_dir[PATH_MAX];
995 	int ret;
996 
997 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
998 
999 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1000 	if (ret)
1001 		return ret;
1002 
1003 	return kcore_copy(from_dir, kcore_dir);
1004 }
1005 
1006 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1007 {
1008 	thread_data->pipes.msg[0] = -1;
1009 	thread_data->pipes.msg[1] = -1;
1010 	thread_data->pipes.ack[0] = -1;
1011 	thread_data->pipes.ack[1] = -1;
1012 }
1013 
1014 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1015 {
1016 	if (pipe(thread_data->pipes.msg))
1017 		return -EINVAL;
1018 
1019 	if (pipe(thread_data->pipes.ack)) {
1020 		close(thread_data->pipes.msg[0]);
1021 		thread_data->pipes.msg[0] = -1;
1022 		close(thread_data->pipes.msg[1]);
1023 		thread_data->pipes.msg[1] = -1;
1024 		return -EINVAL;
1025 	}
1026 
1027 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1028 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1029 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1030 
1031 	return 0;
1032 }
1033 
1034 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1035 {
1036 	if (thread_data->pipes.msg[0] != -1) {
1037 		close(thread_data->pipes.msg[0]);
1038 		thread_data->pipes.msg[0] = -1;
1039 	}
1040 	if (thread_data->pipes.msg[1] != -1) {
1041 		close(thread_data->pipes.msg[1]);
1042 		thread_data->pipes.msg[1] = -1;
1043 	}
1044 	if (thread_data->pipes.ack[0] != -1) {
1045 		close(thread_data->pipes.ack[0]);
1046 		thread_data->pipes.ack[0] = -1;
1047 	}
1048 	if (thread_data->pipes.ack[1] != -1) {
1049 		close(thread_data->pipes.ack[1]);
1050 		thread_data->pipes.ack[1] = -1;
1051 	}
1052 }
1053 
1054 static bool evlist__per_thread(struct evlist *evlist)
1055 {
1056 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1057 }
1058 
1059 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1060 {
1061 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1062 	struct mmap *mmap = evlist->mmap;
1063 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1064 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1065 	bool per_thread = evlist__per_thread(evlist);
1066 
1067 	if (per_thread)
1068 		thread_data->nr_mmaps = nr_mmaps;
1069 	else
1070 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1071 						      thread_data->mask->maps.nbits);
1072 	if (mmap) {
1073 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1074 		if (!thread_data->maps)
1075 			return -ENOMEM;
1076 	}
1077 	if (overwrite_mmap) {
1078 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1079 		if (!thread_data->overwrite_maps) {
1080 			zfree(&thread_data->maps);
1081 			return -ENOMEM;
1082 		}
1083 	}
1084 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1085 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1086 
1087 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1088 		if (per_thread ||
1089 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1090 			if (thread_data->maps) {
1091 				thread_data->maps[tm] = &mmap[m];
1092 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1093 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1094 			}
1095 			if (thread_data->overwrite_maps) {
1096 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1097 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1098 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1099 			}
1100 			tm++;
1101 		}
1102 	}
1103 
1104 	return 0;
1105 }
1106 
1107 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1108 {
1109 	int f, tm, pos;
1110 	struct mmap *map, *overwrite_map;
1111 
1112 	fdarray__init(&thread_data->pollfd, 64);
1113 
1114 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1115 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1116 		overwrite_map = thread_data->overwrite_maps ?
1117 				thread_data->overwrite_maps[tm] : NULL;
1118 
1119 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1120 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1121 
1122 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1123 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1124 							      &evlist->core.pollfd);
1125 				if (pos < 0)
1126 					return pos;
1127 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1128 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1129 			}
1130 		}
1131 	}
1132 
1133 	return 0;
1134 }
1135 
1136 static void record__free_thread_data(struct record *rec)
1137 {
1138 	int t;
1139 	struct record_thread *thread_data = rec->thread_data;
1140 
1141 	if (thread_data == NULL)
1142 		return;
1143 
1144 	for (t = 0; t < rec->nr_threads; t++) {
1145 		record__thread_data_close_pipes(&thread_data[t]);
1146 		zfree(&thread_data[t].maps);
1147 		zfree(&thread_data[t].overwrite_maps);
1148 		fdarray__exit(&thread_data[t].pollfd);
1149 	}
1150 
1151 	zfree(&rec->thread_data);
1152 }
1153 
1154 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1155 						    int evlist_pollfd_index,
1156 						    int thread_pollfd_index)
1157 {
1158 	size_t x = rec->index_map_cnt;
1159 
1160 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1161 		return -ENOMEM;
1162 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1163 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1164 	rec->index_map_cnt += 1;
1165 	return 0;
1166 }
1167 
1168 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1169 						    struct evlist *evlist,
1170 						    struct record_thread *thread_data)
1171 {
1172 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1173 	struct pollfd *t_entries = thread_data->pollfd.entries;
1174 	int err = 0;
1175 	size_t i;
1176 
1177 	for (i = 0; i < rec->index_map_cnt; i++) {
1178 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1179 		int t_pos = rec->index_map[i].thread_pollfd_index;
1180 
1181 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1182 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1183 			pr_err("Thread and evlist pollfd index mismatch\n");
1184 			err = -EINVAL;
1185 			continue;
1186 		}
1187 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1188 	}
1189 	return err;
1190 }
1191 
1192 static int record__dup_non_perf_events(struct record *rec,
1193 				       struct evlist *evlist,
1194 				       struct record_thread *thread_data)
1195 {
1196 	struct fdarray *fda = &evlist->core.pollfd;
1197 	int i, ret;
1198 
1199 	for (i = 0; i < fda->nr; i++) {
1200 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1201 			continue;
1202 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1203 		if (ret < 0) {
1204 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1205 			return ret;
1206 		}
1207 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1208 			  thread_data, ret, fda->entries[i].fd);
1209 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1210 		if (ret < 0) {
1211 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1212 			return ret;
1213 		}
1214 	}
1215 	return 0;
1216 }
1217 
1218 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1219 {
1220 	int t, ret;
1221 	struct record_thread *thread_data;
1222 
1223 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1224 	if (!rec->thread_data) {
1225 		pr_err("Failed to allocate thread data\n");
1226 		return -ENOMEM;
1227 	}
1228 	thread_data = rec->thread_data;
1229 
1230 	for (t = 0; t < rec->nr_threads; t++)
1231 		record__thread_data_init_pipes(&thread_data[t]);
1232 
1233 	for (t = 0; t < rec->nr_threads; t++) {
1234 		thread_data[t].rec = rec;
1235 		thread_data[t].mask = &rec->thread_masks[t];
1236 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1237 		if (ret) {
1238 			pr_err("Failed to initialize thread[%d] maps\n", t);
1239 			goto out_free;
1240 		}
1241 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1242 		if (ret) {
1243 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1244 			goto out_free;
1245 		}
1246 		if (t) {
1247 			thread_data[t].tid = -1;
1248 			ret = record__thread_data_open_pipes(&thread_data[t]);
1249 			if (ret) {
1250 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1251 				goto out_free;
1252 			}
1253 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1254 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1255 			if (ret < 0) {
1256 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1257 				goto out_free;
1258 			}
1259 			thread_data[t].ctlfd_pos = ret;
1260 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1261 				 thread_data, thread_data[t].ctlfd_pos,
1262 				 thread_data[t].pipes.msg[0]);
1263 		} else {
1264 			thread_data[t].tid = gettid();
1265 
1266 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1267 			if (ret < 0)
1268 				goto out_free;
1269 
1270 			thread_data[t].ctlfd_pos = -1; /* Not used */
1271 		}
1272 	}
1273 
1274 	return 0;
1275 
1276 out_free:
1277 	record__free_thread_data(rec);
1278 
1279 	return ret;
1280 }
1281 
1282 static int record__mmap_evlist(struct record *rec,
1283 			       struct evlist *evlist)
1284 {
1285 	int i, ret;
1286 	struct record_opts *opts = &rec->opts;
1287 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1288 				  opts->auxtrace_sample_mode;
1289 
1290 	if (opts->affinity != PERF_AFFINITY_SYS)
1291 		cpu__setup_cpunode_map();
1292 
1293 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1294 				 opts->auxtrace_mmap_pages,
1295 				 auxtrace_overwrite,
1296 				 opts->nr_cblocks, opts->affinity,
1297 				 opts->mmap_flush, opts->comp_level) < 0) {
1298 		if (errno == EPERM) {
1299 			pr_err("Permission error mapping pages.\n"
1300 			       "Consider increasing "
1301 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1302 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1303 			       "(current value: %u,%u)\n",
1304 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1305 			return -errno;
1306 		} else {
1307 			pr_err("failed to mmap: %m\n");
1308 			if (errno)
1309 				return -errno;
1310 			else
1311 				return -EINVAL;
1312 		}
1313 	}
1314 
1315 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1316 		return -1;
1317 
1318 	ret = record__alloc_thread_data(rec, evlist);
1319 	if (ret)
1320 		return ret;
1321 
1322 	if (record__threads_enabled(rec)) {
1323 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1324 		if (ret) {
1325 			errno = -ret;
1326 			pr_err("Failed to create data directory: %m\n");
1327 			return ret;
1328 		}
1329 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1330 			if (evlist->mmap)
1331 				evlist->mmap[i].file = &rec->data.dir.files[i];
1332 			if (evlist->overwrite_mmap)
1333 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1334 		}
1335 	}
1336 
1337 	return 0;
1338 }
1339 
1340 static int record__mmap(struct record *rec)
1341 {
1342 	return record__mmap_evlist(rec, rec->evlist);
1343 }
1344 
/*
 * Open every event of rec->evlist and prepare the session for recording.
 *
 * Events that fail to open (after the fallback and weak-group retries) are
 * reported, marked skippable and removed from the evlist afterwards; the
 * remaining events are renumbered.  The function then applies the event
 * filters, mmaps the ring buffers and publishes the evlist in the session.
 *
 * Returns 0 on success.  Returns -1 when only dummy events (or none) remain
 * after removing the failed ones, or when a filter cannot be applied;
 * otherwise returns the error of record__mmap().
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;
	bool skipped = false;
	bool removed_tracking = false;

	evlist__for_each_entry(evlist, pos) {
		if (removed_tracking) {
			/*
			 * Normally the head of the list has tracking enabled
			 * for sideband data like mmaps. If this event is
			 * removed, make sure to add tracking to the next
			 * processed event.
			 */
			if (!pos->tracking) {
				pos->tracking = true;
				evsel__config(pos, opts, &callchain_param);
			}
			removed_tracking = false;
		}
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			bool report_error = true;

			/* First retry: let evsel__fallback() adjust the event. */
			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/*
			 * Second retry: a non-leader member of a weak group
			 * failed, so reset the group and retry from the evsel
			 * returned by evlist__reset_weak_group().
			 */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
#if defined(__aarch64__) || defined(__arm__)
			if (strstr(evsel__name(pos), "cycles")) {
				struct evsel *pos2;
				/*
				 * Unfortunately ARM has many events named
				 * "cycles" on PMUs like the system-level (L3)
				 * cache which don't support sampling. Only
				 * display such failures to open when there is
				 * only 1 cycles event or verbose is enabled.
				 */
				evlist__for_each_entry(evlist, pos2) {
					if (pos2 == pos)
						continue;
					if (strstr(evsel__name(pos2), "cycles")) {
						report_error = false;
						break;
					}
				}
			}
#endif
			if (report_error || verbose > 0) {
				evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
				ui__error("Failure to open event '%s' on PMU '%s' which will be "
					  "removed.\n%s\n",
					  evsel__name(pos), evsel__pmu_name(pos), msg);
			}
			/* Remember to hand tracking over to the next event. */
			if (pos->tracking)
				removed_tracking = true;
			/* Mark for removal; the actual removal happens after the loop. */
			pos->skippable = true;
			skipped = true;
		}
	}

	if (skipped) {
		struct evsel *tmp;
		int idx = 0;
		bool evlist_empty = true;

		/* Remove evsels that failed to open and update indices. */
		evlist__for_each_entry_safe(evlist, tmp, pos) {
			if (pos->skippable) {
				evlist__remove(evlist, pos);
				continue;
			}

			/*
			 * Note, dummy events may be command line parsed or
			 * added by the tool. We care about supporting `perf
			 * record -e dummy` which may be used as a permission
			 * check. Dummy events that are added to the command
			 * line and opened along with other events that fail,
			 * will still fail as if the dummy events were tool
			 * added events for the sake of code simplicity.
			 */
			if (!evsel__is_dummy_event(pos))
				evlist_empty = false;
		}
		/* Renumber the surviving events consecutively from 0. */
		evlist__for_each_entry(evlist, pos) {
			pos->core.idx = idx++;
		}
		/* If list is empty then fail. */
		if (evlist_empty) {
			ui__error("Failure to open any events for recording.\n");
			rc = -1;
			goto out;
		}
	}
	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure, pos is set to the event whose filter was rejected. */
	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
		pr_err("failed to set filter \"%s\" on event %s: %m\n",
			pos->filter ?: "BPF", evsel__name(pos));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	/* Only publish the evlist in the session once everything succeeded. */
	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1479 
1480 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1481 {
1482 	if (rec->evlist->first_sample_time == 0)
1483 		rec->evlist->first_sample_time = sample_time;
1484 
1485 	if (sample_time)
1486 		rec->evlist->last_sample_time = sample_time;
1487 }
1488 
1489 static int process_sample_event(const struct perf_tool *tool,
1490 				union perf_event *event,
1491 				struct perf_sample *sample,
1492 				struct evsel *evsel,
1493 				struct machine *machine)
1494 {
1495 	struct record *rec = container_of(tool, struct record, tool);
1496 
1497 	set_timestamp_boundary(rec, sample->time);
1498 
1499 	if (rec->buildid_all)
1500 		return 0;
1501 
1502 	rec->samples++;
1503 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1504 }
1505 
1506 static int process_buildids(struct record *rec)
1507 {
1508 	struct perf_session *session = rec->session;
1509 
1510 	if (perf_data__size(&rec->data) == 0)
1511 		return 0;
1512 
1513 	/* A single DSO is needed and not all inline frames. */
1514 	symbol_conf.inline_name = false;
1515 	/*
1516 	 * During this process, it'll load kernel map and replace the
1517 	 * dso->long_name to a real pathname it found.  In this case
1518 	 * we prefer the vmlinux path like
1519 	 *   /lib/modules/3.16.4/build/vmlinux
1520 	 *
1521 	 * rather than build-id path (in debug directory).
1522 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1523 	 */
1524 	symbol_conf.ignore_vmlinux_buildid = true;
1525 	/*
1526 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1527 	 * so no need to process samples. But if timestamp_boundary is enabled,
1528 	 * it still needs to walk on all samples to get the timestamps of
1529 	 * first/last samples.
1530 	 */
1531 	if (rec->buildid_all && !rec->timestamp_boundary)
1532 		rec->tool.sample = process_event_sample_stub;
1533 
1534 	return perf_session__process_events(session);
1535 }
1536 
1537 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1538 {
1539 	int err;
1540 	struct perf_tool *tool = data;
1541 	/*
1542 	 *As for guest kernel when processing subcommand record&report,
1543 	 *we arrange module mmap prior to guest kernel mmap and trigger
1544 	 *a preload dso because default guest module symbols are loaded
1545 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1546 	 *method is used to avoid symbol missing when the first addr is
1547 	 *in module instead of in guest kernel.
1548 	 */
1549 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1550 					     machine);
1551 	if (err < 0)
1552 		pr_err("Couldn't record guest kernel [%d]'s reference"
1553 		       " relocation symbol.\n", machine->pid);
1554 
1555 	/*
1556 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1557 	 * have no _text sometimes.
1558 	 */
1559 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1560 						 machine);
1561 	if (err < 0)
1562 		pr_err("Couldn't record guest kernel [%d]'s reference"
1563 		       " relocation symbol.\n", machine->pid);
1564 }
1565 
1566 static struct perf_event_header finished_round_event = {
1567 	.size = sizeof(struct perf_event_header),
1568 	.type = PERF_RECORD_FINISHED_ROUND,
1569 };
1570 
1571 static struct perf_event_header finished_init_event = {
1572 	.size = sizeof(struct perf_event_header),
1573 	.type = PERF_RECORD_FINISHED_INIT,
1574 };
1575 
1576 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1577 {
1578 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1579 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1580 			  thread->mask->affinity.nbits)) {
1581 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1582 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1583 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1584 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1585 					(cpu_set_t *)thread->mask->affinity.bits);
1586 		if (verbose == 2) {
1587 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1588 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1589 		}
1590 	}
1591 }
1592 
1593 static size_t process_comp_header(void *record, size_t increment)
1594 {
1595 	struct perf_record_compressed2 *event = record;
1596 	size_t size = sizeof(*event);
1597 
1598 	if (increment) {
1599 		event->header.size += increment;
1600 		return increment;
1601 	}
1602 
1603 	event->header.type = PERF_RECORD_COMPRESSED2;
1604 	event->header.size = size;
1605 
1606 	return size;
1607 }
1608 
1609 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1610 			    void *dst, size_t dst_size, void *src, size_t src_size)
1611 {
1612 	ssize_t compressed;
1613 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1614 	struct zstd_data *zstd_data = &session->zstd_data;
1615 
1616 	if (map && map->file)
1617 		zstd_data = &map->zstd_data;
1618 
1619 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1620 						     max_record_size, process_comp_header);
1621 	if (compressed < 0)
1622 		return compressed;
1623 
1624 	if (map && map->file) {
1625 		thread->bytes_transferred += src_size;
1626 		thread->bytes_compressed  += compressed;
1627 	} else {
1628 		session->bytes_transferred += src_size;
1629 		session->bytes_compressed  += compressed;
1630 	}
1631 
1632 	return compressed;
1633 }
1634 
1635 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1636 				    bool overwrite, bool synch)
1637 {
1638 	u64 bytes_written = rec->bytes_written;
1639 	int i;
1640 	int rc = 0;
1641 	int nr_mmaps;
1642 	struct mmap **maps;
1643 	int trace_fd = rec->data.file.fd;
1644 	off_t off = 0;
1645 
1646 	if (!evlist)
1647 		return 0;
1648 
1649 	nr_mmaps = thread->nr_mmaps;
1650 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1651 
1652 	if (!maps)
1653 		return 0;
1654 
1655 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1656 		return 0;
1657 
1658 	if (record__aio_enabled(rec))
1659 		off = record__aio_get_pos(trace_fd);
1660 
1661 	for (i = 0; i < nr_mmaps; i++) {
1662 		u64 flush = 0;
1663 		struct mmap *map = maps[i];
1664 
1665 		if (map->core.base) {
1666 			record__adjust_affinity(rec, map);
1667 			if (synch) {
1668 				flush = map->core.flush;
1669 				map->core.flush = 1;
1670 			}
1671 			if (!record__aio_enabled(rec)) {
1672 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1673 					if (synch)
1674 						map->core.flush = flush;
1675 					rc = -1;
1676 					goto out;
1677 				}
1678 			} else {
1679 				if (record__aio_push(rec, map, &off) < 0) {
1680 					record__aio_set_pos(trace_fd, off);
1681 					if (synch)
1682 						map->core.flush = flush;
1683 					rc = -1;
1684 					goto out;
1685 				}
1686 			}
1687 			if (synch)
1688 				map->core.flush = flush;
1689 		}
1690 
1691 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1692 		    !rec->opts.auxtrace_sample_mode &&
1693 		    record__auxtrace_mmap_read(rec, map) != 0) {
1694 			rc = -1;
1695 			goto out;
1696 		}
1697 	}
1698 
1699 	if (record__aio_enabled(rec))
1700 		record__aio_set_pos(trace_fd, off);
1701 
1702 	/*
1703 	 * Mark the round finished in case we wrote
1704 	 * at least one event.
1705 	 *
1706 	 * No need for round events in directory mode,
1707 	 * because per-cpu maps and files have data
1708 	 * sorted by kernel.
1709 	 */
1710 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1711 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1712 
1713 	if (overwrite)
1714 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1715 out:
1716 	return rc;
1717 }
1718 
1719 static int record__mmap_read_all(struct record *rec, bool synch)
1720 {
1721 	int err;
1722 
1723 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1724 	if (err)
1725 		return err;
1726 
1727 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1728 }
1729 
1730 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1731 					   void *arg __maybe_unused)
1732 {
1733 	struct perf_mmap *map = fda->priv[fd].ptr;
1734 
1735 	if (map)
1736 		perf_mmap__put(map);
1737 }
1738 
/*
 * Entry point of a recording worker thread.
 *
 * @arg: the struct record_thread describing this worker; it is stored in the
 *       file-scope `thread` pointer (declared outside this part of the file).
 *
 * The thread acknowledges its start on the ack pipe, then keeps draining its
 * ring buffers until reading fails, all pollable descriptors are gone, or
 * the message pipe is hung up.  Before returning it performs one final
 * synchronous drain and acknowledges termination on the ack pipe.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Tell the other end of the ack pipe that this thread is running. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %m\n", thread->tid);

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		/* Snapshot the counter to detect whether this pass read anything. */
		unsigned long long hits = thread->samples;

		/*
		 * Once terminate is set, one more read pass is done above
		 * before the loop is left.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Nothing new was counted: block until a descriptor is ready. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Leave once filtering reports no descriptors left. */
			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the message pipe is the request to terminate. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final drain with synch set. */
	record__mmap_read_all(thread->rec, true);

	/* The same msg value (THREAD_MSG__READY) is reused as termination ack. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %m\n", thread->tid);

	return NULL;
}
1798 
1799 static void record__init_features(struct record *rec)
1800 {
1801 	struct perf_session *session = rec->session;
1802 	int feat;
1803 
1804 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1805 		perf_header__set_feat(&session->header, feat);
1806 
1807 	if (rec->no_buildid)
1808 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1809 
1810 	if (!have_tracepoints(&rec->evlist->core.entries))
1811 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1812 
1813 	if (!rec->opts.branch_stack)
1814 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1815 
1816 	if (!rec->opts.full_auxtrace)
1817 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1818 
1819 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1820 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1821 
1822 	if (!rec->opts.use_clockid)
1823 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1824 
1825 	if (!record__threads_enabled(rec))
1826 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1827 
1828 	if (!record__comp_enabled(rec))
1829 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1830 
1831 	perf_header__clear_feat(&session->header, HEADER_STAT);
1832 }
1833 
1834 static void
1835 record__finish_output(struct record *rec)
1836 {
1837 	int i;
1838 	struct perf_data *data = &rec->data;
1839 	int fd = perf_data__fd(data);
1840 
1841 	if (data->is_pipe) {
1842 		/* Just to display approx. size */
1843 		data->file.size = rec->bytes_written;
1844 		return;
1845 	}
1846 
1847 	rec->session->header.data_size += rec->bytes_written;
1848 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1849 	if (record__threads_enabled(rec)) {
1850 		for (i = 0; i < data->dir.nr; i++)
1851 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1852 	}
1853 
1854 	/* Buildid scanning disabled or build ID in kernel and synthesized map events. */
1855 	if (!rec->no_buildid || !rec->no_buildid_cache) {
1856 		process_buildids(rec);
1857 
1858 		if (rec->buildid_all)
1859 			perf_session__dsos_hit_all(rec->session);
1860 	}
1861 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1862 	perf_session__cache_build_ids(rec->session);
1863 }
1864 
1865 static int record__synthesize_workload(struct record *rec, bool tail)
1866 {
1867 	int err;
1868 	struct perf_thread_map *thread_map;
1869 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1870 
1871 	if (rec->opts.tail_synthesize != tail)
1872 		return 0;
1873 
1874 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1875 	if (thread_map == NULL)
1876 		return -1;
1877 
1878 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1879 						 process_synthesized_event,
1880 						 &rec->session->machines.host,
1881 						 needs_mmap,
1882 						 rec->opts.record_data_mmap);
1883 	perf_thread_map__put(thread_map);
1884 	return err;
1885 }
1886 
1887 static int write_finished_init(struct record *rec, bool tail)
1888 {
1889 	if (rec->opts.tail_synthesize != tail)
1890 		return 0;
1891 
1892 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1893 }
1894 
/* Forward declaration: record__switch_output() below calls this before its definition. */
static int record__synthesize(struct record *rec, bool tail);
1896 
/*
 * Close the current output file under a timestamped name and, unless
 * @at_exit is set, continue recording into a fresh one.
 *
 * The tail events are synthesized and the output is finalised before
 * perf_data__switch() is called.  When rec->switch_output.num_files is set,
 * the generated file names are kept in a ring and the file previously stored
 * in the reused slot is removed.
 *
 * Returns the value of perf_data__switch(), or -EINVAL when the current
 * timestamp cannot be obtained.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	/* Emit whatever was deferred to the tail of the current file. */
	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	/* Recording continues into a new file: restart the byte accounting. */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet) {
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);
	}

	if (rec->switch_output.num_files) {
		/* Advance to the next slot of the file name ring, wrapping to 0. */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		/* Delete the file that previously occupied this slot. */
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		/* The ring takes ownership of new_filename. */
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1970 
1971 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1972 					struct perf_record_lost_samples *lost,
1973 					int cpu_idx, int thread_idx, u64 lost_count,
1974 					u16 misc_flag)
1975 {
1976 	struct perf_sample_id *sid;
1977 	struct perf_sample sample;
1978 	int id_hdr_size;
1979 
1980 	perf_sample__init(&sample, /*all=*/true);
1981 	lost->lost = lost_count;
1982 	if (evsel->core.ids) {
1983 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1984 		sample.id = sid->id;
1985 	}
1986 
1987 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1988 						       evsel->core.attr.sample_type, &sample);
1989 	lost->header.size = sizeof(*lost) + id_hdr_size;
1990 	lost->header.misc = misc_flag;
1991 	record__write(rec, NULL, lost, lost->header.size);
1992 	perf_sample__exit(&sample);
1993 }
1994 
1995 static void record__read_lost_samples(struct record *rec)
1996 {
1997 	struct perf_session *session = rec->session;
1998 	struct perf_record_lost_samples_and_ids lost;
1999 	struct evsel *evsel;
2000 
2001 	/* there was an error during record__open */
2002 	if (session->evlist == NULL)
2003 		return;
2004 
2005 	evlist__for_each_entry(session->evlist, evsel) {
2006 		struct xyarray *xy = evsel->core.sample_id;
2007 		u64 lost_count;
2008 
2009 		if (xy == NULL || evsel->core.fd == NULL)
2010 			continue;
2011 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
2012 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
2013 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
2014 			continue;
2015 		}
2016 
2017 		for (int x = 0; x < xyarray__max_x(xy); x++) {
2018 			for (int y = 0; y < xyarray__max_y(xy); y++) {
2019 				struct perf_counts_values count;
2020 
2021 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
2022 					pr_debug("read LOST count failed\n");
2023 					return;
2024 				}
2025 
2026 				if (count.lost) {
2027 					memset(&lost, 0, sizeof(lost));
2028 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2029 					__record__save_lost_samples(rec, evsel, &lost.lost,
2030 								    x, y, count.lost, 0);
2031 				}
2032 			}
2033 		}
2034 
2035 		lost_count = perf_bpf_filter__lost_count(evsel);
2036 		if (lost_count) {
2037 			memset(&lost, 0, sizeof(lost));
2038 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
2039 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2040 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2041 		}
2042 	}
2043 }
2044 
/* Value delivered with the failure signal; written by the handler below. */
static volatile sig_atomic_t workload_exec_errno;

/*
 * evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * NOTE(review): the handler and variable names suggest this reports an exec
 * failure rather than a fork failure - confirm against
 * evlist__prepare_workload().
 *
 * The handler stores the integer carried in the signal's si_value and then
 * sets the `done` and `child_finished` flags (declared outside this part of
 * the file).
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
2060 
/*
 * Forward declarations; the definitions are not in this part of the file.
 * Presumably both are signal handlers, judging by their names and `int sig`
 * parameter - confirm at the definitions.
 */
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);
2063 
2064 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2065 {
2066 	if (evlist) {
2067 		if (evlist->mmap && evlist->mmap[0].core.base)
2068 			return evlist->mmap[0].core.base;
2069 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2070 			return evlist->overwrite_mmap[0].core.base;
2071 	}
2072 	return NULL;
2073 }
2074 
2075 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2076 {
2077 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2078 	if (pc)
2079 		return pc;
2080 	return NULL;
2081 }
2082 
2083 static int record__synthesize(struct record *rec, bool tail)
2084 {
2085 	struct perf_session *session = rec->session;
2086 	struct machine *machine = &session->machines.host;
2087 	struct perf_data *data = &rec->data;
2088 	struct record_opts *opts = &rec->opts;
2089 	struct perf_tool *tool = &rec->tool;
2090 	int err = 0;
2091 	event_op f = process_synthesized_event;
2092 
2093 	if (rec->opts.tail_synthesize != tail)
2094 		return 0;
2095 
2096 	if (data->is_pipe) {
2097 		err = perf_event__synthesize_for_pipe(tool, session, data,
2098 						      process_synthesized_event);
2099 		if (err < 0)
2100 			goto out;
2101 
2102 		rec->bytes_written += err;
2103 	}
2104 
2105 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2106 					  process_synthesized_event, machine);
2107 	if (err)
2108 		goto out;
2109 
2110 	/* Synthesize id_index before auxtrace_info */
2111 	err = perf_event__synthesize_id_index(tool,
2112 					      process_synthesized_event,
2113 					      session->evlist, machine);
2114 	if (err)
2115 		goto out;
2116 
2117 	if (rec->opts.full_auxtrace) {
2118 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2119 					session, process_synthesized_event);
2120 		if (err)
2121 			goto out;
2122 	}
2123 
2124 	if (!evlist__exclude_kernel(rec->evlist)) {
2125 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2126 							 machine);
2127 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2128 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2129 				   "Check /proc/kallsyms permission or run as root.\n");
2130 
2131 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2132 						     machine);
2133 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2134 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2135 				   "Check /proc/modules permission or run as root.\n");
2136 	}
2137 
2138 	if (perf_guest) {
2139 		machines__process_guests(&session->machines,
2140 					 perf_event__synthesize_guest_os, tool);
2141 	}
2142 
2143 	err = perf_event__synthesize_extra_attr(&rec->tool,
2144 						rec->evlist,
2145 						process_synthesized_event,
2146 						data->is_pipe);
2147 	if (err)
2148 		goto out;
2149 
2150 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2151 						 process_synthesized_event,
2152 						NULL);
2153 	if (err < 0) {
2154 		pr_err("Couldn't synthesize thread map.\n");
2155 		return err;
2156 	}
2157 
2158 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2159 					     process_synthesized_event, NULL);
2160 	if (err < 0) {
2161 		pr_err("Couldn't synthesize cpu map.\n");
2162 		return err;
2163 	}
2164 
2165 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2166 						machine, opts);
2167 	if (err < 0) {
2168 		pr_warning("Couldn't synthesize bpf events.\n");
2169 		err = 0;
2170 	}
2171 
2172 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2173 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2174 						     machine);
2175 		if (err < 0) {
2176 			pr_warning("Couldn't synthesize cgroup events.\n");
2177 			err = 0;
2178 		}
2179 	}
2180 
2181 	if (rec->opts.nr_threads_synthesize > 1) {
2182 		mutex_init(&synth_lock);
2183 		perf_set_multithreaded();
2184 		f = process_locked_synthesized_event;
2185 	}
2186 
2187 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2188 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2189 
2190 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2191 						    rec->evlist->core.threads,
2192 						    f, needs_mmap, opts->record_data_mmap,
2193 						    rec->opts.nr_threads_synthesize);
2194 	}
2195 
2196 	if (rec->opts.nr_threads_synthesize > 1) {
2197 		perf_set_singlethreaded();
2198 		mutex_destroy(&synth_lock);
2199 	}
2200 
2201 out:
2202 	return err;
2203 }
2204 
2205 static void record__synthesize_final_bpf_metadata(struct record *rec __maybe_unused)
2206 {
2207 #ifdef HAVE_LIBBPF_SUPPORT
2208 	perf_event__synthesize_final_bpf_metadata(rec->session,
2209 						  process_synthesized_event);
2210 #endif
2211 }
2212 
2213 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2214 {
2215 	struct record *rec = data;
2216 	pthread_kill(rec->thread_id, SIGUSR2);
2217 	return 0;
2218 }
2219 
2220 static int record__setup_sb_evlist(struct record *rec)
2221 {
2222 	struct record_opts *opts = &rec->opts;
2223 
2224 	if (rec->sb_evlist != NULL) {
2225 		/*
2226 		 * We get here if --switch-output-event populated the
2227 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2228 		 * to the main thread.
2229 		 */
2230 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2231 		rec->thread_id = pthread_self();
2232 	}
2233 #ifdef HAVE_LIBBPF_SUPPORT
2234 	if (!opts->no_bpf_event) {
2235 		if (rec->sb_evlist == NULL) {
2236 			rec->sb_evlist = evlist__new();
2237 
2238 			if (rec->sb_evlist == NULL) {
2239 				pr_err("Couldn't create side band evlist.\n.");
2240 				return -1;
2241 			}
2242 		}
2243 
2244 		if (evlist__add_bpf_sb_event(rec->sb_evlist, perf_session__env(rec->session))) {
2245 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2246 			return -1;
2247 		}
2248 	}
2249 #endif
2250 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2251 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2252 		opts->no_bpf_event = true;
2253 	}
2254 
2255 	return 0;
2256 }
2257 
2258 static int record__init_clock(struct record *rec)
2259 {
2260 	struct perf_session *session = rec->session;
2261 	struct timespec ref_clockid;
2262 	struct timeval ref_tod;
2263 	struct perf_env *env = perf_session__env(session);
2264 	u64 ref;
2265 
2266 	if (!rec->opts.use_clockid)
2267 		return 0;
2268 
2269 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2270 		env->clock.clockid_res_ns = rec->opts.clockid_res_ns;
2271 
2272 	env->clock.clockid = rec->opts.clockid;
2273 
2274 	if (gettimeofday(&ref_tod, NULL) != 0) {
2275 		pr_err("gettimeofday failed, cannot set reference time.\n");
2276 		return -1;
2277 	}
2278 
2279 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2280 		pr_err("clock_gettime failed, cannot set reference time.\n");
2281 		return -1;
2282 	}
2283 
2284 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2285 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2286 
2287 	env->clock.tod_ns = ref;
2288 
2289 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2290 	      (u64) ref_clockid.tv_nsec;
2291 
2292 	env->clock.clockid_ns = ref;
2293 	return 0;
2294 }
2295 
2296 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2297 {
2298 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2299 		trigger_hit(&auxtrace_snapshot_trigger);
2300 		auxtrace_record__snapshot_started = 1;
2301 		if (auxtrace_record__snapshot_start(rec->itr))
2302 			trigger_error(&auxtrace_snapshot_trigger);
2303 	}
2304 }
2305 
2306 static int record__terminate_thread(struct record_thread *thread_data)
2307 {
2308 	int err;
2309 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2310 	pid_t tid = thread_data->tid;
2311 
2312 	close(thread_data->pipes.msg[1]);
2313 	thread_data->pipes.msg[1] = -1;
2314 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2315 	if (err > 0)
2316 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2317 	else
2318 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2319 			   thread->tid, tid);
2320 
2321 	return 0;
2322 }
2323 
2324 static int record__start_threads(struct record *rec)
2325 {
2326 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2327 	struct record_thread *thread_data = rec->thread_data;
2328 	sigset_t full, mask;
2329 	pthread_t handle;
2330 	pthread_attr_t attrs;
2331 
2332 	thread = &thread_data[0];
2333 
2334 	if (!record__threads_enabled(rec))
2335 		return 0;
2336 
2337 	sigfillset(&full);
2338 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2339 		pr_err("Failed to block signals on threads start: %m\n");
2340 		return -1;
2341 	}
2342 
2343 	pthread_attr_init(&attrs);
2344 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2345 
2346 	for (t = 1; t < nr_threads; t++) {
2347 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2348 
2349 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2350 		pthread_attr_setaffinity_np(&attrs,
2351 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2352 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2353 #endif
2354 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2355 			for (tt = 1; tt < t; tt++)
2356 				record__terminate_thread(&thread_data[t]);
2357 			pr_err("Failed to start threads: %m\n");
2358 			ret = -1;
2359 			goto out_err;
2360 		}
2361 
2362 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2363 		if (err > 0)
2364 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2365 				  thread_msg_tags[msg]);
2366 		else
2367 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2368 				   thread->tid, rec->thread_data[t].tid);
2369 	}
2370 
2371 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2372 			(cpu_set_t *)thread->mask->affinity.bits);
2373 
2374 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2375 
2376 out_err:
2377 	pthread_attr_destroy(&attrs);
2378 
2379 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2380 		pr_err("Failed to unblock signals on threads start: %m\n");
2381 		ret = -1;
2382 	}
2383 
2384 	return ret;
2385 }
2386 
2387 static int record__stop_threads(struct record *rec)
2388 {
2389 	int t;
2390 	struct record_thread *thread_data = rec->thread_data;
2391 
2392 	for (t = 1; t < rec->nr_threads; t++)
2393 		record__terminate_thread(&thread_data[t]);
2394 
2395 	for (t = 0; t < rec->nr_threads; t++) {
2396 		rec->samples += thread_data[t].samples;
2397 		if (!record__threads_enabled(rec))
2398 			continue;
2399 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2400 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2401 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2402 			 thread_data[t].samples, thread_data[t].waking);
2403 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2404 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2405 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2406 		else
2407 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2408 	}
2409 
2410 	return 0;
2411 }
2412 
2413 static unsigned long record__waking(struct record *rec)
2414 {
2415 	int t;
2416 	unsigned long waking = 0;
2417 	struct record_thread *thread_data = rec->thread_data;
2418 
2419 	for (t = 0; t < rec->nr_threads; t++)
2420 		waking += thread_data[t].waking;
2421 
2422 	return waking;
2423 }
2424 
2425 static int __cmd_record(struct record *rec, int argc, const char **argv)
2426 {
2427 	int err;
2428 	int status = 0;
2429 	const bool forks = argc > 0;
2430 	struct perf_tool *tool = &rec->tool;
2431 	struct record_opts *opts = &rec->opts;
2432 	struct perf_data *data = &rec->data;
2433 	struct perf_session *session;
2434 	bool disabled = false, draining = false;
2435 	int fd;
2436 	float ratio = 0;
2437 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2438 	struct perf_env *env;
2439 
2440 	atexit(record__sig_exit);
2441 	signal(SIGCHLD, sig_handler);
2442 	signal(SIGINT, sig_handler);
2443 	signal(SIGTERM, sig_handler);
2444 	signal(SIGSEGV, sigsegv_handler);
2445 
2446 	if (rec->opts.record_cgroup) {
2447 #ifndef HAVE_FILE_HANDLE
2448 		pr_err("cgroup tracking is not supported\n");
2449 		return -1;
2450 #endif
2451 	}
2452 
2453 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2454 		signal(SIGUSR2, snapshot_sig_handler);
2455 		if (rec->opts.auxtrace_snapshot_mode)
2456 			trigger_on(&auxtrace_snapshot_trigger);
2457 		if (rec->switch_output.enabled)
2458 			trigger_on(&switch_output_trigger);
2459 	} else {
2460 		signal(SIGUSR2, SIG_IGN);
2461 	}
2462 
2463 	perf_tool__init(tool, /*ordered_events=*/true);
2464 	tool->sample		= process_sample_event;
2465 	tool->fork		= perf_event__process_fork;
2466 	tool->exit		= perf_event__process_exit;
2467 	tool->comm		= perf_event__process_comm;
2468 	tool->namespaces	= perf_event__process_namespaces;
2469 	tool->mmap		= build_id__process_mmap;
2470 	tool->mmap2		= build_id__process_mmap2;
2471 	tool->itrace_start	= process_timestamp_boundary;
2472 	tool->aux		= process_timestamp_boundary;
2473 	tool->namespace_events	= rec->opts.record_namespaces;
2474 	tool->cgroup_events	= rec->opts.record_cgroup;
2475 	session = perf_session__new(data, tool);
2476 	if (IS_ERR(session)) {
2477 		pr_err("Perf session creation failed.\n");
2478 		return PTR_ERR(session);
2479 	}
2480 	env = perf_session__env(session);
2481 	if (record__threads_enabled(rec)) {
2482 		if (perf_data__is_pipe(&rec->data)) {
2483 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2484 			return -1;
2485 		}
2486 		if (rec->opts.full_auxtrace) {
2487 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2488 			return -1;
2489 		}
2490 	}
2491 
2492 	fd = perf_data__fd(data);
2493 	rec->session = session;
2494 
2495 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2496 		pr_err("Compression initialization failed.\n");
2497 		return -1;
2498 	}
2499 #ifdef HAVE_EVENTFD_SUPPORT
2500 	done_fd = eventfd(0, EFD_NONBLOCK);
2501 	if (done_fd < 0) {
2502 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2503 		status = -1;
2504 		goto out_delete_session;
2505 	}
2506 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2507 	if (err < 0) {
2508 		pr_err("Failed to add wakeup eventfd to poll list\n");
2509 		status = err;
2510 		goto out_delete_session;
2511 	}
2512 #endif // HAVE_EVENTFD_SUPPORT
2513 
2514 	env->comp_type  = PERF_COMP_ZSTD;
2515 	env->comp_level = rec->opts.comp_level;
2516 
2517 	if (rec->opts.kcore &&
2518 	    !record__kcore_readable(&session->machines.host)) {
2519 		pr_err("ERROR: kcore is not readable.\n");
2520 		return -1;
2521 	}
2522 
2523 	if (record__init_clock(rec))
2524 		return -1;
2525 
2526 	record__init_features(rec);
2527 
2528 	if (forks) {
2529 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2530 					       workload_exec_failed_signal);
2531 		if (err < 0) {
2532 			pr_err("Couldn't run the workload!\n");
2533 			status = err;
2534 			goto out_delete_session;
2535 		}
2536 	}
2537 
2538 	/*
2539 	 * If we have just single event and are sending data
2540 	 * through pipe, we need to force the ids allocation,
2541 	 * because we synthesize event name through the pipe
2542 	 * and need the id for that.
2543 	 */
2544 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2545 		rec->opts.sample_id = true;
2546 
2547 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2548 		rec->timestamp_filename = false;
2549 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2550 	}
2551 
2552 	/*
2553 	 * Use global stat_config that is zero meaning aggr_mode is AGGR_NONE
2554 	 * and hybrid_merge is false.
2555 	 */
2556 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2557 
2558 	evlist__config(rec->evlist, opts, &callchain_param);
2559 
2560 	/* Debug message used by test scripts */
2561 	pr_debug3("perf record opening and mmapping events\n");
2562 	if (record__open(rec) != 0) {
2563 		err = -1;
2564 		goto out_free_threads;
2565 	}
2566 	/* Debug message used by test scripts */
2567 	pr_debug3("perf record done opening and mmapping events\n");
2568 	env->comp_mmap_len = session->evlist->core.mmap_len;
2569 
2570 	if (rec->opts.kcore) {
2571 		err = record__kcore_copy(&session->machines.host, data);
2572 		if (err) {
2573 			pr_err("ERROR: Failed to copy kcore\n");
2574 			goto out_free_threads;
2575 		}
2576 	}
2577 
2578 	/*
2579 	 * Normally perf_session__new would do this, but it doesn't have the
2580 	 * evlist.
2581 	 */
2582 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2583 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2584 		rec->tool.ordered_events = false;
2585 	}
2586 
2587 	if (evlist__nr_groups(rec->evlist) == 0)
2588 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2589 
2590 	if (data->is_pipe) {
2591 		err = perf_header__write_pipe(fd);
2592 		if (err < 0)
2593 			goto out_free_threads;
2594 	} else {
2595 		err = perf_session__write_header(session, rec->evlist, fd, false);
2596 		if (err < 0)
2597 			goto out_free_threads;
2598 	}
2599 
2600 	err = -1;
2601 	if (!rec->no_buildid
2602 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2603 		pr_err("Couldn't generate buildids. "
2604 		       "Use --no-buildid to profile anyway.\n");
2605 		goto out_free_threads;
2606 	}
2607 
2608 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2609 		opts->no_bpf_event = true;
2610 
2611 	err = record__setup_sb_evlist(rec);
2612 	if (err)
2613 		goto out_free_threads;
2614 
2615 	err = record__synthesize(rec, false);
2616 	if (err < 0)
2617 		goto out_free_threads;
2618 
2619 	if (rec->realtime_prio) {
2620 		struct sched_param param;
2621 
2622 		param.sched_priority = rec->realtime_prio;
2623 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2624 			pr_err("Could not set realtime priority.\n");
2625 			err = -1;
2626 			goto out_free_threads;
2627 		}
2628 	}
2629 
2630 	if (record__start_threads(rec))
2631 		goto out_free_threads;
2632 
2633 	/*
2634 	 * When perf is starting the traced process, all the events
2635 	 * (apart from group members) have enable_on_exec=1 set,
2636 	 * so don't spoil it by prematurely enabling them.
2637 	 */
2638 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2639 		evlist__enable(rec->evlist);
2640 
2641 	/*
2642 	 * offcpu-time does not call execve, so enable_on_exe wouldn't work
2643 	 * when recording a workload, do it manually
2644 	 */
2645 	if (rec->off_cpu)
2646 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2647 
2648 	/*
2649 	 * Let the child rip
2650 	 */
2651 	if (forks) {
2652 		struct machine *machine = &session->machines.host;
2653 		union perf_event *event;
2654 		pid_t tgid;
2655 
2656 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2657 		if (event == NULL) {
2658 			err = -ENOMEM;
2659 			goto out_child;
2660 		}
2661 
2662 		/*
2663 		 * Some H/W events are generated before COMM event
2664 		 * which is emitted during exec(), so perf script
2665 		 * cannot see a correct process name for those events.
2666 		 * Synthesize COMM event to prevent it.
2667 		 */
2668 		tgid = perf_event__synthesize_comm(tool, event,
2669 						   rec->evlist->workload.pid,
2670 						   process_synthesized_event,
2671 						   machine);
2672 		free(event);
2673 
2674 		if (tgid == -1)
2675 			goto out_child;
2676 
2677 		event = malloc(sizeof(event->namespaces) +
2678 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2679 			       machine->id_hdr_size);
2680 		if (event == NULL) {
2681 			err = -ENOMEM;
2682 			goto out_child;
2683 		}
2684 
2685 		/*
2686 		 * Synthesize NAMESPACES event for the command specified.
2687 		 */
2688 		perf_event__synthesize_namespaces(tool, event,
2689 						  rec->evlist->workload.pid,
2690 						  tgid, process_synthesized_event,
2691 						  machine);
2692 		free(event);
2693 
2694 		evlist__start_workload(rec->evlist);
2695 	}
2696 
2697 	if (opts->target.initial_delay) {
2698 		pr_info(EVLIST_DISABLED_MSG);
2699 		if (opts->target.initial_delay > 0) {
2700 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2701 			evlist__enable(rec->evlist);
2702 			pr_info(EVLIST_ENABLED_MSG);
2703 		}
2704 	}
2705 
2706 	err = event_enable_timer__start(rec->evlist->eet);
2707 	if (err)
2708 		goto out_child;
2709 
2710 	/* Debug message used by test scripts */
2711 	pr_debug3("perf record has started\n");
2712 	fflush(stderr);
2713 
2714 	trigger_ready(&auxtrace_snapshot_trigger);
2715 	trigger_ready(&switch_output_trigger);
2716 	perf_hooks__invoke_record_start();
2717 
2718 	/*
2719 	 * Must write FINISHED_INIT so it will be seen after all other
2720 	 * synthesized user events, but before any regular events.
2721 	 */
2722 	err = write_finished_init(rec, false);
2723 	if (err < 0)
2724 		goto out_child;
2725 
2726 	for (;;) {
2727 		unsigned long long hits = thread->samples;
2728 
2729 		/*
2730 		 * rec->evlist->bkw_mmap_state is possible to be
2731 		 * BKW_MMAP_EMPTY here: when done == true and
2732 		 * hits != rec->samples in previous round.
2733 		 *
2734 		 * evlist__toggle_bkw_mmap ensure we never
2735 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2736 		 */
2737 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2738 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2739 
2740 		if (record__mmap_read_all(rec, false) < 0) {
2741 			trigger_error(&auxtrace_snapshot_trigger);
2742 			trigger_error(&switch_output_trigger);
2743 			err = -1;
2744 			goto out_child;
2745 		}
2746 
2747 		if (auxtrace_record__snapshot_started) {
2748 			auxtrace_record__snapshot_started = 0;
2749 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2750 				record__read_auxtrace_snapshot(rec, false);
2751 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2752 				pr_err("AUX area tracing snapshot failed\n");
2753 				err = -1;
2754 				goto out_child;
2755 			}
2756 		}
2757 
2758 		if (trigger_is_hit(&switch_output_trigger)) {
2759 			/*
2760 			 * If switch_output_trigger is hit, the data in
2761 			 * overwritable ring buffer should have been collected,
2762 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2763 			 *
2764 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2765 			 * record__mmap_read_all() didn't collect data from
2766 			 * overwritable ring buffer. Read again.
2767 			 */
2768 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2769 				continue;
2770 			trigger_ready(&switch_output_trigger);
2771 
2772 			/*
2773 			 * Reenable events in overwrite ring buffer after
2774 			 * record__mmap_read_all(): we should have collected
2775 			 * data from it.
2776 			 */
2777 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2778 
2779 			if (!quiet)
2780 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2781 					record__waking(rec));
2782 			thread->waking = 0;
2783 			fd = record__switch_output(rec, false);
2784 			if (fd < 0) {
2785 				pr_err("Failed to switch to new file\n");
2786 				trigger_error(&switch_output_trigger);
2787 				err = fd;
2788 				goto out_child;
2789 			}
2790 
2791 			/* re-arm the alarm */
2792 			if (rec->switch_output.time)
2793 				alarm(rec->switch_output.time);
2794 		}
2795 
2796 		if (hits == thread->samples) {
2797 			if (done || draining)
2798 				break;
2799 			err = fdarray__poll(&thread->pollfd, -1);
2800 			/*
2801 			 * Propagate error, only if there's any. Ignore positive
2802 			 * number of returned events and interrupt error.
2803 			 */
2804 			if (err > 0 || (err < 0 && errno == EINTR))
2805 				err = 0;
2806 			thread->waking++;
2807 
2808 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2809 					    record__thread_munmap_filtered, NULL) == 0)
2810 				draining = true;
2811 
2812 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2813 			if (err)
2814 				goto out_child;
2815 		}
2816 
2817 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2818 			switch (cmd) {
2819 			case EVLIST_CTL_CMD_SNAPSHOT:
2820 				hit_auxtrace_snapshot_trigger(rec);
2821 				evlist__ctlfd_ack(rec->evlist);
2822 				break;
2823 			case EVLIST_CTL_CMD_STOP:
2824 				done = 1;
2825 				break;
2826 			case EVLIST_CTL_CMD_ACK:
2827 			case EVLIST_CTL_CMD_UNSUPPORTED:
2828 			case EVLIST_CTL_CMD_ENABLE:
2829 			case EVLIST_CTL_CMD_DISABLE:
2830 			case EVLIST_CTL_CMD_EVLIST:
2831 			case EVLIST_CTL_CMD_PING:
2832 			default:
2833 				break;
2834 			}
2835 		}
2836 
2837 		err = event_enable_timer__process(rec->evlist->eet);
2838 		if (err < 0)
2839 			goto out_child;
2840 		if (err) {
2841 			err = 0;
2842 			done = 1;
2843 		}
2844 
2845 		/*
2846 		 * When perf is starting the traced process, at the end events
2847 		 * die with the process and we wait for that. Thus no need to
2848 		 * disable events in this case.
2849 		 */
2850 		if (done && !disabled && !target__none(&opts->target)) {
2851 			trigger_off(&auxtrace_snapshot_trigger);
2852 			evlist__disable(rec->evlist);
2853 			disabled = true;
2854 		}
2855 	}
2856 
2857 	trigger_off(&auxtrace_snapshot_trigger);
2858 	trigger_off(&switch_output_trigger);
2859 
2860 	record__synthesize_final_bpf_metadata(rec);
2861 
2862 	if (opts->auxtrace_snapshot_on_exit)
2863 		record__auxtrace_snapshot_exit(rec);
2864 
2865 	if (forks && workload_exec_errno) {
2866 		char msg[STRERR_BUFSIZE];
2867 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2868 		struct strbuf sb = STRBUF_INIT;
2869 
2870 		evlist__format_evsels(rec->evlist, &sb, 2048);
2871 
2872 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2873 			sb.buf, argv[0], emsg);
2874 		strbuf_release(&sb);
2875 		err = -1;
2876 		goto out_child;
2877 	}
2878 
2879 	if (!quiet)
2880 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2881 			record__waking(rec));
2882 
2883 	write_finished_init(rec, true);
2884 
2885 	if (target__none(&rec->opts.target))
2886 		record__synthesize_workload(rec, true);
2887 
2888 out_child:
2889 	record__stop_threads(rec);
2890 	record__mmap_read_all(rec, true);
2891 out_free_threads:
2892 	record__free_thread_data(rec);
2893 	evlist__finalize_ctlfd(rec->evlist);
2894 	record__aio_mmap_read_sync(rec);
2895 
2896 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2897 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2898 		env->comp_ratio = ratio + 0.5;
2899 	}
2900 
2901 	if (forks) {
2902 		int exit_status;
2903 
2904 		if (!child_finished)
2905 			kill(rec->evlist->workload.pid, SIGTERM);
2906 
2907 		wait(&exit_status);
2908 
2909 		if (err < 0)
2910 			status = err;
2911 		else if (WIFEXITED(exit_status))
2912 			status = WEXITSTATUS(exit_status);
2913 		else if (WIFSIGNALED(exit_status))
2914 			signr = WTERMSIG(exit_status);
2915 	} else
2916 		status = err;
2917 
2918 	if (rec->off_cpu)
2919 		rec->bytes_written += off_cpu_write(rec->session);
2920 
2921 	record__read_lost_samples(rec);
2922 	/* this will be recalculated during process_buildids() */
2923 	rec->samples = 0;
2924 
2925 	if (!err) {
2926 		record__synthesize(rec, true);
2927 		if (!rec->timestamp_filename) {
2928 			record__finish_output(rec);
2929 		} else {
2930 			fd = record__switch_output(rec, true);
2931 			if (fd < 0) {
2932 				status = fd;
2933 				goto out_delete_session;
2934 			}
2935 		}
2936 	}
2937 
2938 	perf_hooks__invoke_record_end();
2939 
2940 	if (!err && !quiet) {
2941 		char samples[128];
2942 		const char *postfix = rec->timestamp_filename ?
2943 					".<timestamp>" : "";
2944 
2945 		if (rec->samples && !rec->opts.full_auxtrace)
2946 			scnprintf(samples, sizeof(samples),
2947 				  " (%" PRIu64 " samples)", rec->samples);
2948 		else
2949 			samples[0] = '\0';
2950 
2951 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2952 			perf_data__size(data) / 1024.0 / 1024.0,
2953 			data->path, postfix, samples);
2954 		if (ratio) {
2955 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2956 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2957 					ratio);
2958 		}
2959 		fprintf(stderr, " ]\n");
2960 	}
2961 
2962 out_delete_session:
2963 #ifdef HAVE_EVENTFD_SUPPORT
2964 	if (done_fd >= 0) {
2965 		fd = done_fd;
2966 		done_fd = -1;
2967 
2968 		close(fd);
2969 	}
2970 #endif
2971 	zstd_fini(&session->zstd_data);
2972 	if (!opts->no_bpf_event)
2973 		evlist__stop_sb_thread(rec->sb_evlist);
2974 
2975 	perf_session__delete(session);
2976 	return status;
2977 }
2978 
2979 static int record_parse_callchain_opt(const struct option *opt,
2980 			       const char *arg,
2981 			       int unset)
2982 {
2983 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2984 }
2985 
2986 static int record_callchain_opt(const struct option *opt,
2987 				const char *arg __maybe_unused,
2988 				int unset)
2989 {
2990 	/*
2991 	 * The -g option only sets the callchain if not already configured by
2992 	 * .perfconfig. It does, however, enable it.
2993 	 */
2994 	if (callchain_param.record_mode != CALLCHAIN_NONE) {
2995 		callchain_param.enabled = true;
2996 		return 0;
2997 	}
2998 
2999 	return record_opts__parse_callchain(opt->value, &callchain_param,
3000 					    EM_HOST != EM_S390 ? "fp" : "dwarf",
3001 					    unset);
3002 }
3003 
3004 
3005 static int perf_record_config(const char *var, const char *value, void *cb)
3006 {
3007 	struct record *rec = cb;
3008 
3009 	if (!strcmp(var, "record.build-id")) {
3010 		if (!strcmp(value, "cache"))
3011 			rec->no_buildid_cache = false;
3012 		else if (!strcmp(value, "no-cache"))
3013 			rec->no_buildid_cache = true;
3014 		else if (!strcmp(value, "skip"))
3015 			rec->no_buildid = rec->no_buildid_cache = true;
3016 		else if (!strcmp(value, "mmap"))
3017 			rec->buildid_mmap = true;
3018 		else if (!strcmp(value, "no-mmap"))
3019 			rec->buildid_mmap = false;
3020 		else
3021 			return -1;
3022 		return 0;
3023 	}
3024 	if (!strcmp(var, "record.call-graph")) {
3025 		var = "call-graph.record-mode";
3026 		return perf_default_config(var, value, cb);
3027 	}
3028 #ifdef HAVE_AIO_SUPPORT
3029 	if (!strcmp(var, "record.aio")) {
3030 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3031 		if (!rec->opts.nr_cblocks)
3032 			rec->opts.nr_cblocks = nr_cblocks_default;
3033 	}
3034 #endif
3035 	if (!strcmp(var, "record.debuginfod")) {
3036 		rec->debuginfod.urls = strdup(value);
3037 		if (!rec->debuginfod.urls)
3038 			return -ENOMEM;
3039 		rec->debuginfod.set = true;
3040 	}
3041 
3042 	return 0;
3043 }
3044 
3045 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3046 {
3047 	struct record *rec = (struct record *)opt->value;
3048 
3049 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3050 }
3051 
3052 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3053 {
3054 	struct record_opts *opts = (struct record_opts *)opt->value;
3055 
3056 	if (unset || !str)
3057 		return 0;
3058 
3059 	if (!strcasecmp(str, "node"))
3060 		opts->affinity = PERF_AFFINITY_NODE;
3061 	else if (!strcasecmp(str, "cpu"))
3062 		opts->affinity = PERF_AFFINITY_CPU;
3063 
3064 	return 0;
3065 }
3066 
3067 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3068 {
3069 	mask->nbits = nr_bits;
3070 	mask->bits = bitmap_zalloc(mask->nbits);
3071 	if (!mask->bits)
3072 		return -ENOMEM;
3073 
3074 	return 0;
3075 }
3076 
3077 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3078 {
3079 	bitmap_free(mask->bits);
3080 	mask->nbits = 0;
3081 }
3082 
3083 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3084 {
3085 	int ret;
3086 
3087 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3088 	if (ret) {
3089 		mask->affinity.bits = NULL;
3090 		return ret;
3091 	}
3092 
3093 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3094 	if (ret) {
3095 		record__mmap_cpu_mask_free(&mask->maps);
3096 		mask->maps.bits = NULL;
3097 	}
3098 
3099 	return ret;
3100 }
3101 
3102 static void record__thread_mask_free(struct thread_mask *mask)
3103 {
3104 	record__mmap_cpu_mask_free(&mask->maps);
3105 	record__mmap_cpu_mask_free(&mask->affinity);
3106 }
3107 
3108 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3109 {
3110 	int s;
3111 	struct record_opts *opts = opt->value;
3112 
3113 	if (unset || !str || !strlen(str)) {
3114 		opts->threads_spec = THREAD_SPEC__CPU;
3115 	} else {
3116 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3117 			if (s == THREAD_SPEC__USER) {
3118 				opts->threads_user_spec = strdup(str);
3119 				if (!opts->threads_user_spec)
3120 					return -ENOMEM;
3121 				opts->threads_spec = THREAD_SPEC__USER;
3122 				break;
3123 			}
3124 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3125 				opts->threads_spec = s;
3126 				break;
3127 			}
3128 		}
3129 	}
3130 
3131 	if (opts->threads_spec == THREAD_SPEC__USER)
3132 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3133 	else
3134 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3135 
3136 	return 0;
3137 }
3138 
3139 static int parse_output_max_size(const struct option *opt,
3140 				 const char *str, int unset)
3141 {
3142 	unsigned long *s = (unsigned long *)opt->value;
3143 	static struct parse_tag tags_size[] = {
3144 		{ .tag  = 'B', .mult = 1       },
3145 		{ .tag  = 'K', .mult = 1 << 10 },
3146 		{ .tag  = 'M', .mult = 1 << 20 },
3147 		{ .tag  = 'G', .mult = 1 << 30 },
3148 		{ .tag  = 0 },
3149 	};
3150 	unsigned long val;
3151 
3152 	if (unset) {
3153 		*s = 0;
3154 		return 0;
3155 	}
3156 
3157 	val = parse_tag_value(str, tags_size);
3158 	if (val != (unsigned long) -1) {
3159 		*s = val;
3160 		return 0;
3161 	}
3162 
3163 	return -1;
3164 }
3165 
3166 static int record__parse_mmap_pages(const struct option *opt,
3167 				    const char *str,
3168 				    int unset __maybe_unused)
3169 {
3170 	struct record_opts *opts = opt->value;
3171 	char *s, *p;
3172 	unsigned int mmap_pages;
3173 	int ret;
3174 
3175 	if (!str)
3176 		return -EINVAL;
3177 
3178 	s = strdup(str);
3179 	if (!s)
3180 		return -ENOMEM;
3181 
3182 	p = strchr(s, ',');
3183 	if (p)
3184 		*p = '\0';
3185 
3186 	if (*s) {
3187 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3188 		if (ret)
3189 			goto out_free;
3190 		opts->mmap_pages = mmap_pages;
3191 	}
3192 
3193 	if (!p) {
3194 		ret = 0;
3195 		goto out_free;
3196 	}
3197 
3198 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3199 	if (ret)
3200 		goto out_free;
3201 
3202 	opts->auxtrace_mmap_pages = mmap_pages;
3203 
3204 out_free:
3205 	free(s);
3206 	return ret;
3207 }
3208 
3209 static int record__parse_off_cpu_thresh(const struct option *opt,
3210 					const char *str,
3211 					int unset __maybe_unused)
3212 {
3213 	struct record_opts *opts = opt->value;
3214 	char *endptr;
3215 	u64 off_cpu_thresh_ms;
3216 
3217 	if (!str)
3218 		return -EINVAL;
3219 
3220 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3221 
3222 	/* the threshold isn't string "0", yet strtoull() returns 0, parsing failed */
3223 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3224 		return -EINVAL;
3225 	else
3226 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3227 
3228 	return 0;
3229 }
3230 
3231 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3232 {
3233 }
3234 
3235 static int parse_control_option(const struct option *opt,
3236 				const char *str,
3237 				int unset __maybe_unused)
3238 {
3239 	struct record_opts *opts = opt->value;
3240 
3241 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3242 }
3243 
3244 static void switch_output_size_warn(struct record *rec)
3245 {
3246 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3247 	struct switch_output *s = &rec->switch_output;
3248 
3249 	wakeup_size /= 2;
3250 
3251 	if (s->size < wakeup_size) {
3252 		char buf[100];
3253 
3254 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3255 		pr_warning("WARNING: switch-output data size lower than "
3256 			   "wakeup kernel buffer size (%s) "
3257 			   "expect bigger perf.data sizes\n", buf);
3258 	}
3259 }
3260 
3261 static int switch_output_setup(struct record *rec)
3262 {
3263 	struct switch_output *s = &rec->switch_output;
3264 	static struct parse_tag tags_size[] = {
3265 		{ .tag  = 'B', .mult = 1       },
3266 		{ .tag  = 'K', .mult = 1 << 10 },
3267 		{ .tag  = 'M', .mult = 1 << 20 },
3268 		{ .tag  = 'G', .mult = 1 << 30 },
3269 		{ .tag  = 0 },
3270 	};
3271 	static struct parse_tag tags_time[] = {
3272 		{ .tag  = 's', .mult = 1        },
3273 		{ .tag  = 'm', .mult = 60       },
3274 		{ .tag  = 'h', .mult = 60*60    },
3275 		{ .tag  = 'd', .mult = 60*60*24 },
3276 		{ .tag  = 0 },
3277 	};
3278 	unsigned long val;
3279 
3280 	/*
3281 	 * If we're using --switch-output-events, then we imply its
3282 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3283 	 *  thread to its parent.
3284 	 */
3285 	if (rec->switch_output_event_set) {
3286 		if (record__threads_enabled(rec)) {
3287 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3288 			return 0;
3289 		}
3290 		goto do_signal;
3291 	}
3292 
3293 	if (!s->set)
3294 		return 0;
3295 
3296 	if (record__threads_enabled(rec)) {
3297 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3298 		return 0;
3299 	}
3300 
3301 	if (!strcmp(s->str, "signal")) {
3302 do_signal:
3303 		s->signal = true;
3304 		pr_debug("switch-output with SIGUSR2 signal\n");
3305 		goto enabled;
3306 	}
3307 
3308 	val = parse_tag_value(s->str, tags_size);
3309 	if (val != (unsigned long) -1) {
3310 		s->size = val;
3311 		pr_debug("switch-output with %s size threshold\n", s->str);
3312 		goto enabled;
3313 	}
3314 
3315 	val = parse_tag_value(s->str, tags_time);
3316 	if (val != (unsigned long) -1) {
3317 		s->time = val;
3318 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3319 			 s->str, s->time);
3320 		goto enabled;
3321 	}
3322 
3323 	return -1;
3324 
3325 enabled:
3326 	rec->timestamp_filename = true;
3327 	s->enabled              = true;
3328 
3329 	if (s->size && !rec->opts.no_buffering)
3330 		switch_output_size_warn(rec);
3331 
3332 	return 0;
3333 }
3334 
/* NULL-terminated list of usage lines for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Non-static alias of the usage lines above. */
const char * const *record_usage = __record_usage;
3341 
3342 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3343 				  struct perf_sample *sample, struct machine *machine)
3344 {
3345 	/*
3346 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3347 	 * no need to add them twice.
3348 	 */
3349 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3350 		return 0;
3351 	return perf_event__process_mmap(tool, event, sample, machine);
3352 }
3353 
3354 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3355 				   struct perf_sample *sample, struct machine *machine)
3356 {
3357 	/*
3358 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3359 	 * no need to add them twice.
3360 	 */
3361 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3362 		return 0;
3363 
3364 	return perf_event__process_mmap2(tool, event, sample, machine);
3365 }
3366 
3367 static int process_timestamp_boundary(const struct perf_tool *tool,
3368 				      union perf_event *event __maybe_unused,
3369 				      struct perf_sample *sample,
3370 				      struct machine *machine __maybe_unused)
3371 {
3372 	struct record *rec = container_of(tool, struct record, tool);
3373 
3374 	set_timestamp_boundary(rec, sample->time);
3375 	return 0;
3376 }
3377 
3378 static int parse_record_synth_option(const struct option *opt,
3379 				     const char *str,
3380 				     int unset __maybe_unused)
3381 {
3382 	struct record_opts *opts = opt->value;
3383 	char *p = strdup(str);
3384 
3385 	if (p == NULL)
3386 		return -1;
3387 
3388 	opts->synth = parse_synth_opt(p);
3389 	free(p);
3390 
3391 	if (opts->synth < 0) {
3392 		pr_err("Invalid synth option: %s\n", str);
3393 		return -1;
3394 	}
3395 	return 0;
3396 }
3397 
3398 /*
3399  * XXX Ideally would be local to cmd_record() and passed to a record__new
3400  * because we need to have access to it in record__exit, that is called
3401  * after cmd_record() exits, but since record_options need to be accessible to
3402  * builtin-script, leave it here.
3403  *
 * At least we don't touch it in all the other functions here directly.
3405  *
3406  * Just say no to tons of global variables, sigh.
3407  */
3408 static struct record record = {
3409 	.opts = {
3410 		.sample_time	     = true,
3411 		.mmap_pages	     = UINT_MAX,
3412 		.user_freq	     = UINT_MAX,
3413 		.user_interval	     = ULLONG_MAX,
3414 		.freq		     = 4000,
3415 		.target		     = {
3416 			.uses_mmap   = true,
3417 			.default_per_cpu = true,
3418 		},
3419 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3420 		.nr_threads_synthesize = 1,
3421 		.ctl_fd              = -1,
3422 		.ctl_fd_ack          = -1,
3423 		.synth               = PERF_SYNTH_ALL,
3424 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3425 	},
3426 	.buildid_mmap = true,
3427 };
3428 
3429 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3430 	"\n\t\t\t\tDefault: fp";
3431 
3432 static bool dry_run;
3433 
3434 static struct parse_events_option_args parse_events_option_args = {
3435 	.evlistp = &record.evlist,
3436 };
3437 
3438 static struct parse_events_option_args switch_output_parse_events_option_args = {
3439 	.evlistp = &record.sb_evlist,
3440 };
3441 
3442 /*
3443  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3444  * with it and switch to use the library functions in perf_evlist that came
3445  * from builtin-record.c, i.e. use record_opts,
3446  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3447  * using pipes, etc.
3448  */
3449 static struct option __record_options[] = {
3450 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3451 		     "event selector. use 'perf list' to list available events",
3452 		     parse_events_option),
3453 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3454 		     "event filter", parse_filter),
3455 	OPT_BOOLEAN(0, "latency", &record.latency,
3456 		    "Enable data collection for latency profiling.\n"
3457 		    "\t\t\t  Use perf report --latency for latency-centric profile."),
3458 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3459 			   NULL, "don't record events from perf itself",
3460 			   exclude_perf),
3461 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3462 		    "record events on existing process id"),
3463 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3464 		    "record events on existing thread id"),
3465 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3466 		    "collect data with this RT SCHED_FIFO priority"),
3467 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3468 		    "collect data without buffering"),
3469 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3470 		    "collect raw sample records from all opened counters"),
3471 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3472 			    "system-wide collection from all CPUs"),
3473 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3474 		    "list of cpus to monitor"),
3475 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3476 	OPT_STRING('o', "output", &record.data.path, "file",
3477 		    "output file name"),
3478 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3479 			&record.opts.no_inherit_set,
3480 			"child tasks do not inherit counters"),
3481 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3482 		    "synthesize non-sample events at the end of output"),
3483 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3484 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3485 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3486 		    "Fail if the specified frequency can't be used"),
3487 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3488 		     "profile at this frequency",
3489 		      record__parse_freq),
3490 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3491 		     "number of mmap data pages and AUX area tracing mmap pages",
3492 		     record__parse_mmap_pages),
3493 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3494 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3495 		     record__mmap_flush_parse),
3496 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
3497 			   NULL, "enables call-graph recording" ,
3498 			   &record_callchain_opt),
3499 	OPT_CALLBACK(0, "call-graph", &record.opts,
3500 		     "record_mode[,record_size]", record_callchain_help,
3501 		     &record_parse_callchain_opt),
3502 	OPT_INCR('v', "verbose", &verbose,
3503 		    "be more verbose (show counter open errors, etc)"),
3504 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3505 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3506 		    "per thread counts"),
3507 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3508 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3509 		    "Record the sample physical addresses"),
3510 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3511 		    "Record the sampled data address data page size"),
3512 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3513 		    "Record the sampled code address (ip) page size"),
3514 	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3515 		    "Record the data source for memory operations"),
3516 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3517 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3518 		    "Record the sample identifier"),
3519 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3520 			&record.opts.sample_time_set,
3521 			"Record the sample timestamps"),
3522 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3523 			"Record the sample period"),
3524 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3525 		    "don't sample"),
3526 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3527 			&record.no_buildid_cache_set,
3528 			"do not update the buildid cache"),
3529 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3530 			&record.no_buildid_set,
3531 			"do not collect buildids in perf.data"),
3532 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3533 		     "monitor event in cgroup name only",
3534 		     parse_cgroups),
3535 	OPT_CALLBACK('D', "delay", &record, "ms",
3536 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3537 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3538 		     record__parse_event_enable_time),
3539 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3540 	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3541 
3542 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3543 		     "branch any", "sample any taken branches",
3544 		     parse_branch_stack),
3545 
3546 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3547 		     "branch filter mask", "branch stack filter modes",
3548 		     parse_branch_stack),
3549 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3550 		    "sample by weight (on special events only)"),
3551 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3552 		    "sample transaction flags (special events only)"),
3553 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3554 		    "use per-thread mmaps"),
3555 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3556 		    "sample selected machine registers on interrupt,"
3557 		    " use '-I?' to list register names", parse_intr_regs),
3558 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3559 		    "sample selected machine registers in user space,"
3560 		    " use '--user-regs=?' to list register names", parse_user_regs),
3561 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3562 		    "Record running/enabled time of read (:S) events"),
3563 	OPT_CALLBACK('k', "clockid", &record.opts,
3564 	"clockid", "clockid to use for events, see clock_gettime()",
3565 	parse_clockid),
3566 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3567 			  "opts", "AUX area tracing Snapshot Mode", ""),
3568 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3569 			  "opts", "sample AUX area", ""),
3570 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3571 			"per thread proc mmap processing timeout in ms"),
3572 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3573 		    "Record namespaces events"),
3574 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3575 		    "Record cgroup events"),
3576 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3577 			&record.opts.record_switch_events_set,
3578 			"Record context switch events"),
3579 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3580 			 "Configure all used events to run in kernel space.",
3581 			 PARSE_OPT_EXCLUSIVE),
3582 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3583 			 "Configure all used events to run in user space.",
3584 			 PARSE_OPT_EXCLUSIVE),
3585 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3586 		    "collect kernel callchains"),
3587 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3588 		    "collect user callchains"),
3589 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3590 		   "file", "vmlinux pathname"),
3591 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3592 		    "Record build-id of all DSOs regardless of hits"),
3593 	OPT_BOOLEAN_SET(0, "buildid-mmap", &record.buildid_mmap, &record.buildid_mmap_set,
3594 			"Record build-id in mmap events and skip build-id processing."),
3595 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3596 		    "append timestamp to output filename"),
3597 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3598 		    "Record timestamp boundary (time of first/last samples)"),
3599 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3600 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3601 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3602 			  "signal"),
3603 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3604 			 &record.switch_output_event_set, "switch output event",
3605 			 "switch output event selector. use 'perf list' to list available events",
3606 			 parse_events_option_new_evlist),
3607 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3608 		   "Limit number of switch output generated files"),
3609 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3610 		    "Parse options then exit"),
3611 #ifdef HAVE_AIO_SUPPORT
3612 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3613 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3614 		     record__aio_parse),
3615 #endif
3616 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3617 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3618 		     record__parse_affinity),
3619 #ifdef HAVE_ZSTD_SUPPORT
3620 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3621 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3622 			    record__parse_comp_level),
3623 #endif
3624 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3625 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3626 	OPT_UINTEGER(0, "num-thread-synthesize",
3627 		     &record.opts.nr_threads_synthesize,
3628 		     "number of threads to run for event synthesis"),
3629 #ifdef HAVE_LIBPFM
3630 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3631 		"libpfm4 event selector. use 'perf list' to list available events",
3632 		parse_libpfm_events_option),
3633 #endif
3634 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3635 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3636 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3637 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3638 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3639 		      parse_control_option),
3640 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3641 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3642 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3643 			  &record.debuginfod.set, "debuginfod urls",
3644 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3645 			  "system"),
3646 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3647 			    "write collected trace data into several data files using parallel threads",
3648 			    record__parse_threads),
3649 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3650 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3651 		   "BPF filter action"),
3652 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3653 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3654 		     record__parse_off_cpu_thresh),
3655 	OPT_BOOLEAN_SET(0, "data-mmap", &record.opts.record_data_mmap,
3656 			&record.opts.record_data_mmap_set,
3657 			"Record mmap events for non-executable mappings"),
3658 	OPT_END()
3659 };
3660 
3661 struct option *record_options = __record_options;
3662 
3663 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3664 {
3665 	struct perf_cpu cpu;
3666 	unsigned int idx;
3667 
3668 	if (cpu_map__is_dummy(cpus))
3669 		return 0;
3670 
3671 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3672 		/* Return ENODEV is input cpu is greater than max cpu */
3673 		if ((unsigned long)cpu.cpu > mask->nbits)
3674 			return -ENODEV;
3675 		__set_bit(cpu.cpu, mask->bits);
3676 	}
3677 
3678 	return 0;
3679 }
3680 
3681 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3682 {
3683 	struct perf_cpu_map *cpus;
3684 
3685 	cpus = perf_cpu_map__new(mask_spec);
3686 	if (!cpus)
3687 		return -ENOMEM;
3688 
3689 	bitmap_zero(mask->bits, mask->nbits);
3690 	if (record__mmap_cpu_mask_init(mask, cpus))
3691 		return -ENODEV;
3692 
3693 	perf_cpu_map__put(cpus);
3694 
3695 	return 0;
3696 }
3697 
3698 static void record__free_thread_masks(struct record *rec, int nr_threads)
3699 {
3700 	int t;
3701 
3702 	if (rec->thread_masks)
3703 		for (t = 0; t < nr_threads; t++)
3704 			record__thread_mask_free(&rec->thread_masks[t]);
3705 
3706 	zfree(&rec->thread_masks);
3707 }
3708 
3709 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3710 {
3711 	int t, ret;
3712 
3713 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3714 	if (!rec->thread_masks) {
3715 		pr_err("Failed to allocate thread masks\n");
3716 		return -ENOMEM;
3717 	}
3718 
3719 	for (t = 0; t < nr_threads; t++) {
3720 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3721 		if (ret) {
3722 			pr_err("Failed to allocate thread masks[%d]\n", t);
3723 			goto out_free;
3724 		}
3725 	}
3726 
3727 	return 0;
3728 
3729 out_free:
3730 	record__free_thread_masks(rec, nr_threads);
3731 
3732 	return ret;
3733 }
3734 
3735 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3736 {
3737 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3738 
3739 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3740 	if (ret)
3741 		return ret;
3742 
3743 	rec->nr_threads = nr_cpus;
3744 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3745 
3746 	for (t = 0; t < rec->nr_threads; t++) {
3747 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3748 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3749 		if (verbose > 0) {
3750 			pr_debug("thread_masks[%d]: ", t);
3751 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3752 			pr_debug("thread_masks[%d]: ", t);
3753 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3754 		}
3755 	}
3756 
3757 	return 0;
3758 }
3759 
3760 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3761 					  const char **maps_spec, const char **affinity_spec,
3762 					  u32 nr_spec)
3763 {
3764 	u32 s;
3765 	int ret = 0, t = 0;
3766 	struct mmap_cpu_mask cpus_mask;
3767 	struct thread_mask thread_mask, full_mask, *thread_masks;
3768 
3769 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3770 	if (ret) {
3771 		pr_err("Failed to allocate CPUs mask\n");
3772 		return ret;
3773 	}
3774 
3775 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3776 	if (ret) {
3777 		pr_err("Failed to init cpu mask\n");
3778 		goto out_free_cpu_mask;
3779 	}
3780 
3781 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3782 	if (ret) {
3783 		pr_err("Failed to allocate full mask\n");
3784 		goto out_free_cpu_mask;
3785 	}
3786 
3787 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3788 	if (ret) {
3789 		pr_err("Failed to allocate thread mask\n");
3790 		goto out_free_full_and_cpu_masks;
3791 	}
3792 
3793 	for (s = 0; s < nr_spec; s++) {
3794 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3795 		if (ret) {
3796 			pr_err("Failed to initialize maps thread mask\n");
3797 			goto out_free;
3798 		}
3799 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3800 		if (ret) {
3801 			pr_err("Failed to initialize affinity thread mask\n");
3802 			goto out_free;
3803 		}
3804 
3805 		/* ignore invalid CPUs but do not allow empty masks */
3806 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3807 				cpus_mask.bits, thread_mask.maps.nbits)) {
3808 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3809 			ret = -EINVAL;
3810 			goto out_free;
3811 		}
3812 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3813 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3814 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3815 			ret = -EINVAL;
3816 			goto out_free;
3817 		}
3818 
3819 		/* do not allow intersection with other masks (full_mask) */
3820 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3821 				      thread_mask.maps.nbits)) {
3822 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3823 			ret = -EINVAL;
3824 			goto out_free;
3825 		}
3826 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3827 				      thread_mask.affinity.nbits)) {
3828 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3829 			ret = -EINVAL;
3830 			goto out_free;
3831 		}
3832 
3833 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3834 			  thread_mask.maps.bits, full_mask.maps.nbits);
3835 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3836 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3837 
3838 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3839 		if (!thread_masks) {
3840 			pr_err("Failed to reallocate thread masks\n");
3841 			ret = -ENOMEM;
3842 			goto out_free;
3843 		}
3844 		rec->thread_masks = thread_masks;
3845 		rec->thread_masks[t] = thread_mask;
3846 		if (verbose > 0) {
3847 			pr_debug("thread_masks[%d]: ", t);
3848 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3849 			pr_debug("thread_masks[%d]: ", t);
3850 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3851 		}
3852 		t++;
3853 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3854 		if (ret) {
3855 			pr_err("Failed to allocate thread mask\n");
3856 			goto out_free_full_and_cpu_masks;
3857 		}
3858 	}
3859 	rec->nr_threads = t;
3860 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3861 	if (!rec->nr_threads)
3862 		ret = -EINVAL;
3863 
3864 out_free:
3865 	record__thread_mask_free(&thread_mask);
3866 out_free_full_and_cpu_masks:
3867 	record__thread_mask_free(&full_mask);
3868 out_free_cpu_mask:
3869 	record__mmap_cpu_mask_free(&cpus_mask);
3870 
3871 	return ret;
3872 }
3873 
3874 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3875 {
3876 	int ret;
3877 	struct cpu_topology *topo;
3878 
3879 	topo = cpu_topology__new();
3880 	if (!topo) {
3881 		pr_err("Failed to allocate CPU topology\n");
3882 		return -ENOMEM;
3883 	}
3884 
3885 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3886 					     topo->core_cpus_list, topo->core_cpus_lists);
3887 	cpu_topology__delete(topo);
3888 
3889 	return ret;
3890 }
3891 
3892 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3893 {
3894 	int ret;
3895 	struct cpu_topology *topo;
3896 
3897 	topo = cpu_topology__new();
3898 	if (!topo) {
3899 		pr_err("Failed to allocate CPU topology\n");
3900 		return -ENOMEM;
3901 	}
3902 
3903 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3904 					     topo->package_cpus_list, topo->package_cpus_lists);
3905 	cpu_topology__delete(topo);
3906 
3907 	return ret;
3908 }
3909 
3910 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3911 {
3912 	u32 s;
3913 	int ret;
3914 	const char **spec;
3915 	struct numa_topology *topo;
3916 
3917 	topo = numa_topology__new();
3918 	if (!topo) {
3919 		pr_err("Failed to allocate NUMA topology\n");
3920 		return -ENOMEM;
3921 	}
3922 
3923 	spec = zalloc(topo->nr * sizeof(char *));
3924 	if (!spec) {
3925 		pr_err("Failed to allocate NUMA spec\n");
3926 		ret = -ENOMEM;
3927 		goto out_delete_topo;
3928 	}
3929 	for (s = 0; s < topo->nr; s++)
3930 		spec[s] = topo->nodes[s].cpus;
3931 
3932 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3933 
3934 	zfree(&spec);
3935 
3936 out_delete_topo:
3937 	numa_topology__delete(topo);
3938 
3939 	return ret;
3940 }
3941 
3942 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3943 {
3944 	int t, ret;
3945 	u32 s, nr_spec = 0;
3946 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3947 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3948 
3949 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3950 		spec = strtok_r(user_spec, ":", &spec_ptr);
3951 		if (spec == NULL)
3952 			break;
3953 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3954 		mask = strtok_r(spec, "/", &mask_ptr);
3955 		if (mask == NULL)
3956 			break;
3957 		pr_debug2("  maps mask: %s\n", mask);
3958 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3959 		if (!tmp_spec) {
3960 			pr_err("Failed to reallocate maps spec\n");
3961 			ret = -ENOMEM;
3962 			goto out_free;
3963 		}
3964 		maps_spec = tmp_spec;
3965 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3966 		if (!maps_spec[nr_spec]) {
3967 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3968 			ret = -ENOMEM;
3969 			goto out_free;
3970 		}
3971 		mask = strtok_r(NULL, "/", &mask_ptr);
3972 		if (mask == NULL) {
3973 			pr_err("Invalid thread maps or affinity specs\n");
3974 			ret = -EINVAL;
3975 			goto out_free;
3976 		}
3977 		pr_debug2("  affinity mask: %s\n", mask);
3978 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3979 		if (!tmp_spec) {
3980 			pr_err("Failed to reallocate affinity spec\n");
3981 			ret = -ENOMEM;
3982 			goto out_free;
3983 		}
3984 		affinity_spec = tmp_spec;
3985 		affinity_spec[nr_spec] = strdup(mask);
3986 		if (!affinity_spec[nr_spec]) {
3987 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3988 			ret = -ENOMEM;
3989 			goto out_free;
3990 		}
3991 		dup_mask = NULL;
3992 		nr_spec++;
3993 	}
3994 
3995 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3996 					     (const char **)affinity_spec, nr_spec);
3997 
3998 out_free:
3999 	free(dup_mask);
4000 	for (s = 0; s < nr_spec; s++) {
4001 		if (maps_spec)
4002 			free(maps_spec[s]);
4003 		if (affinity_spec)
4004 			free(affinity_spec[s]);
4005 	}
4006 	free(affinity_spec);
4007 	free(maps_spec);
4008 
4009 	return ret;
4010 }
4011 
4012 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
4013 {
4014 	int ret;
4015 
4016 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
4017 	if (ret)
4018 		return ret;
4019 
4020 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
4021 		return -ENODEV;
4022 
4023 	rec->nr_threads = 1;
4024 
4025 	return 0;
4026 }
4027 
4028 static int record__init_thread_masks(struct record *rec)
4029 {
4030 	int ret = 0;
4031 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4032 
4033 	if (!record__threads_enabled(rec))
4034 		return record__init_thread_default_masks(rec, cpus);
4035 
4036 	if (evlist__per_thread(rec->evlist)) {
4037 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4038 		return -EINVAL;
4039 	}
4040 
4041 	switch (rec->opts.threads_spec) {
4042 	case THREAD_SPEC__CPU:
4043 		ret = record__init_thread_cpu_masks(rec, cpus);
4044 		break;
4045 	case THREAD_SPEC__CORE:
4046 		ret = record__init_thread_core_masks(rec, cpus);
4047 		break;
4048 	case THREAD_SPEC__PACKAGE:
4049 		ret = record__init_thread_package_masks(rec, cpus);
4050 		break;
4051 	case THREAD_SPEC__NUMA:
4052 		ret = record__init_thread_numa_masks(rec, cpus);
4053 		break;
4054 	case THREAD_SPEC__USER:
4055 		ret = record__init_thread_user_masks(rec, cpus);
4056 		break;
4057 	default:
4058 		break;
4059 	}
4060 
4061 	return ret;
4062 }
4063 
4064 int cmd_record(int argc, const char **argv)
4065 {
4066 	int err;
4067 	struct record *rec = &record;
4068 	char errbuf[BUFSIZ];
4069 
4070 	setlocale(LC_ALL, "");
4071 
4072 #ifndef HAVE_BPF_SKEL
4073 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4074 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4075 # undef set_nobuild
4076 #endif
4077 
4078 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
4079 	symbol_conf.lazy_load_kernel_maps = true;
4080 	rec->opts.affinity = PERF_AFFINITY_SYS;
4081 
4082 	rec->evlist = evlist__new();
4083 	if (rec->evlist == NULL)
4084 		return -ENOMEM;
4085 
4086 	err = perf_config(perf_record_config, rec);
4087 	if (err)
4088 		return err;
4089 
4090 	argc = parse_options(argc, argv, record_options, record_usage,
4091 			    PARSE_OPT_STOP_AT_NON_OPTION);
4092 	if (quiet)
4093 		perf_quiet_option();
4094 
4095 	err = symbol__validate_sym_arguments();
4096 	if (err)
4097 		return err;
4098 
4099 	perf_debuginfod_setup(&record.debuginfod);
4100 
4101 	/*
4102 	 * Use system wide (-a) for the default target (ie. when no
4103 	 * workload). User ID filtering also implies system-wide.
4104 	 */
4105 	if ((!argc && target__none(&rec->opts.target)) || rec->uid_str)
4106 		rec->opts.target.system_wide = true;
4107 
4108 	if (nr_cgroups && !rec->opts.target.system_wide) {
4109 		usage_with_options_msg(record_usage, record_options,
4110 			"cgroup monitoring only available in system-wide mode");
4111 
4112 	}
4113 
4114 	if (record.latency) {
4115 		/*
4116 		 * There is no fundamental reason why latency profiling
4117 		 * can't work for system-wide mode, but exact semantics
4118 		 * and details are to be defined.
4119 		 * See the following thread for details:
4120 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4121 		 */
4122 		if (record.opts.target.system_wide) {
4123 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4124 			err = -EINVAL;
4125 			goto out_opts;
4126 		}
4127 		record.opts.record_switch_events = true;
4128 	}
4129 
4130 	if (rec->buildid_mmap && !perf_can_record_build_id()) {
4131 		pr_warning("Missing support for build id in kernel mmap events.\n"
4132 			   "Disable this warning with --no-buildid-mmap\n");
4133 		rec->buildid_mmap = false;
4134 	}
4135 
4136 	if (rec->buildid_mmap) {
4137 		/* Enable perf_event_attr::build_id bit. */
4138 		rec->opts.build_id = true;
4139 		/* Disable build-ID table in the header. */
4140 		rec->no_buildid = true;
4141 	} else {
4142 		pr_debug("Disabling build id in synthesized mmap2 events.\n");
4143 		symbol_conf.no_buildid_mmap2 = true;
4144 	}
4145 
4146 	if (rec->no_buildid_set && rec->no_buildid) {
4147 		/* -B implies -N for historic reasons. */
4148 		rec->no_buildid_cache = true;
4149 	}
4150 
4151 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4152 		pr_err("Kernel has no cgroup sampling support.\n");
4153 		err = -EINVAL;
4154 		goto out_opts;
4155 	}
4156 
4157 	if (rec->opts.kcore)
4158 		rec->opts.text_poke = true;
4159 
4160 	if (rec->opts.kcore || record__threads_enabled(rec))
4161 		rec->data.is_dir = true;
4162 
4163 	if (record__threads_enabled(rec)) {
4164 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4165 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4166 			goto out_opts;
4167 		}
4168 		if (record__aio_enabled(rec)) {
4169 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4170 			goto out_opts;
4171 		}
4172 	}
4173 
4174 	if (rec->opts.comp_level != 0) {
4175 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4176 		rec->no_buildid = true;
4177 	}
4178 
4179 	if (rec->opts.record_switch_events &&
4180 	    !perf_can_record_switch_events()) {
4181 		ui__error("kernel does not support recording context switch events\n");
4182 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4183 		err = -EINVAL;
4184 		goto out_opts;
4185 	}
4186 
4187 	if (switch_output_setup(rec)) {
4188 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4189 		err = -EINVAL;
4190 		goto out_opts;
4191 	}
4192 
4193 	if (rec->switch_output.time) {
4194 		signal(SIGALRM, alarm_sig_handler);
4195 		alarm(rec->switch_output.time);
4196 	}
4197 
4198 	if (rec->switch_output.num_files) {
4199 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4200 						      sizeof(char *));
4201 		if (!rec->switch_output.filenames) {
4202 			err = -EINVAL;
4203 			goto out_opts;
4204 		}
4205 	}
4206 
4207 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4208 		rec->timestamp_filename = false;
4209 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4210 	}
4211 
4212 	if (rec->filter_action) {
4213 		if (!strcmp(rec->filter_action, "pin"))
4214 			err = perf_bpf_filter__pin();
4215 		else if (!strcmp(rec->filter_action, "unpin"))
4216 			err = perf_bpf_filter__unpin();
4217 		else {
4218 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4219 			err = -EINVAL;
4220 		}
4221 		goto out_opts;
4222 	}
4223 
4224 	/* For backward compatibility, -d implies --mem-info and --data-mmap */
4225 	if (rec->opts.sample_address) {
4226 		rec->opts.sample_data_src = true;
4227 		if (!rec->opts.record_data_mmap_set)
4228 			rec->opts.record_data_mmap = true;
4229 	}
4230 
4231 	/*
4232 	 * Allow aliases to facilitate the lookup of symbols for address
4233 	 * filters. Refer to auxtrace_parse_filters().
4234 	 */
4235 	symbol_conf.allow_aliases = true;
4236 
4237 	symbol__init(NULL);
4238 
4239 	err = record__auxtrace_init(rec);
4240 	if (err)
4241 		goto out;
4242 
4243 	if (dry_run)
4244 		goto out;
4245 
4246 	err = -ENOMEM;
4247 
4248 	if (rec->no_buildid_cache) {
4249 		disable_buildid_cache();
4250 	} else if (rec->switch_output.enabled) {
4251 		/*
4252 		 * In 'perf record --switch-output', disable buildid
4253 		 * generation by default to reduce data file switching
4254 		 * overhead. Still generate buildid if they are required
4255 		 * explicitly using
4256 		 *
4257 		 *  perf record --switch-output --no-no-buildid \
4258 		 *              --no-no-buildid-cache
4259 		 *
4260 		 * Following code equals to:
4261 		 *
4262 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4263 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4264 		 *         disable_buildid_cache();
4265 		 */
4266 		bool disable = true;
4267 
4268 		if (rec->no_buildid_set && !rec->no_buildid)
4269 			disable = false;
4270 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4271 			disable = false;
4272 		if (disable) {
4273 			rec->no_buildid = true;
4274 			rec->no_buildid_cache = true;
4275 			disable_buildid_cache();
4276 		}
4277 	}
4278 
4279 	if (record.opts.overwrite)
4280 		record.opts.tail_synthesize = true;
4281 
4282 	if (rec->evlist->core.nr_entries == 0) {
4283 		struct evlist *def_evlist = evlist__new_default(&rec->opts.target,
4284 								callchain_param.enabled);
4285 
4286 		if (!def_evlist)
4287 			goto out;
4288 
4289 		evlist__splice_list_tail(rec->evlist, &def_evlist->core.entries);
4290 		evlist__delete(def_evlist);
4291 	}
4292 
4293 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4294 		rec->opts.no_inherit = true;
4295 
4296 	err = target__validate(&rec->opts.target);
4297 	if (err) {
4298 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4299 		ui__warning("%s\n", errbuf);
4300 	}
4301 
4302 	if (rec->uid_str) {
4303 		uid_t uid = parse_uid(rec->uid_str);
4304 
4305 		if (uid == UINT_MAX) {
4306 			ui__error("Invalid User: %s", rec->uid_str);
4307 			err = -EINVAL;
4308 			goto out;
4309 		}
4310 		err = parse_uid_filter(rec->evlist, uid);
4311 		if (err)
4312 			goto out;
4313 	}
4314 
4315 	/* Enable ignoring missing threads when -p option is defined. */
4316 	rec->opts.ignore_missing_thread = rec->opts.target.pid;
4317 
4318 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4319 
4320 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4321 		arch__add_leaf_frame_record_opts(&rec->opts);
4322 
4323 	err = -ENOMEM;
4324 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4325 		if (rec->opts.target.pid != NULL) {
4326 			pr_err("Couldn't create thread/CPU maps: %s\n",
4327 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4328 			goto out;
4329 		}
4330 		else
4331 			usage_with_options(record_usage, record_options);
4332 	}
4333 
4334 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4335 	if (err)
4336 		goto out;
4337 
4338 	/*
4339 	 * We take all buildids when the file contains
4340 	 * AUX area tracing data because we do not decode the
4341 	 * trace because it would take too long.
4342 	 */
4343 	if (rec->opts.full_auxtrace)
4344 		rec->buildid_all = true;
4345 
4346 	if (rec->opts.text_poke) {
4347 		err = record__config_text_poke(rec->evlist);
4348 		if (err) {
4349 			pr_err("record__config_text_poke failed, error %d\n", err);
4350 			goto out;
4351 		}
4352 	}
4353 
4354 	if (rec->off_cpu) {
4355 		err = record__config_off_cpu(rec);
4356 		if (err) {
4357 			pr_err("record__config_off_cpu failed, error %d\n", err);
4358 			goto out;
4359 		}
4360 	}
4361 
4362 	if (record_opts__config(&rec->opts)) {
4363 		err = -EINVAL;
4364 		goto out;
4365 	}
4366 
4367 	err = record__config_tracking_events(rec);
4368 	if (err) {
4369 		pr_err("record__config_tracking_events failed, error %d\n", err);
4370 		goto out;
4371 	}
4372 
4373 	err = record__init_thread_masks(rec);
4374 	if (err) {
4375 		pr_err("Failed to initialize parallel data streaming masks\n");
4376 		goto out;
4377 	}
4378 
4379 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4380 		rec->opts.nr_cblocks = nr_cblocks_max;
4381 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4382 
4383 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4384 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4385 
4386 	if (rec->opts.comp_level > comp_level_max)
4387 		rec->opts.comp_level = comp_level_max;
4388 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4389 
4390 	err = __cmd_record(&record, argc, argv);
4391 out:
4392 	record__free_thread_masks(rec, rec->nr_threads);
4393 	rec->nr_threads = 0;
4394 	symbol__exit();
4395 	auxtrace_record__free(rec->itr);
4396 out_opts:
4397 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4398 	evlist__delete(rec->evlist);
4399 	return err;
4400 }
4401 
4402 static void snapshot_sig_handler(int sig __maybe_unused)
4403 {
4404 	struct record *rec = &record;
4405 
4406 	hit_auxtrace_snapshot_trigger(rec);
4407 
4408 	if (switch_output_signal(rec))
4409 		trigger_hit(&switch_output_trigger);
4410 }
4411 
4412 static void alarm_sig_handler(int sig __maybe_unused)
4413 {
4414 	struct record *rec = &record;
4415 
4416 	if (switch_output_time(rec))
4417 		trigger_hit(&switch_output_trigger);
4418 }
4419