xref: /linux/tools/perf/builtin-record.c (revision ab38e84ba9a80581e055408e0f8c0158998fa4b9)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/stat.h"
30 #include "util/symbol.h"
31 #include "util/record.h"
32 #include "util/cpumap.h"
33 #include "util/thread_map.h"
34 #include "util/data.h"
35 #include "util/perf_regs.h"
36 #include "util/auxtrace.h"
37 #include "util/tsc.h"
38 #include "util/parse-branch-options.h"
39 #include "util/parse-regs-options.h"
40 #include "util/perf_api_probe.h"
41 #include "util/trigger.h"
42 #include "util/perf-hooks.h"
43 #include "util/cpu-set-sched.h"
44 #include "util/synthetic-events.h"
45 #include "util/time-utils.h"
46 #include "util/units.h"
47 #include "util/bpf-event.h"
48 #include "util/util.h"
49 #include "util/pfm.h"
50 #include "util/pmu.h"
51 #include "util/pmus.h"
52 #include "util/clockid.h"
53 #include "util/off_cpu.h"
54 #include "util/bpf-filter.h"
55 #include "util/strbuf.h"
56 #include "asm/bug.h"
57 #include "perf.h"
58 #include "cputopo.h"
59 
60 #include <errno.h>
61 #include <inttypes.h>
62 #include <locale.h>
63 #include <poll.h>
64 #include <pthread.h>
65 #include <unistd.h>
66 #ifndef HAVE_GETTID
67 #include <syscall.h>
68 #endif
69 #include <sched.h>
70 #include <signal.h>
71 #ifdef HAVE_EVENTFD_SUPPORT
72 #include <sys/eventfd.h>
73 #endif
74 #include <sys/mman.h>
75 #include <sys/wait.h>
76 #include <sys/types.h>
77 #include <sys/stat.h>
78 #include <fcntl.h>
79 #include <linux/err.h>
80 #include <linux/string.h>
81 #include <linux/time64.h>
82 #include <linux/zalloc.h>
83 #include <linux/bitmap.h>
84 #include <sys/time.h>
85 
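/*
 * State driving --switch-output: whether rotation is triggered by signal,
 * by written size or by elapsed time, plus a bounded ring of generated
 * output filenames so the oldest data file can be removed.
 */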
86 struct switch_output {
87 	bool		 enabled;
88 	bool		 signal;
89 	unsigned long	 size;
90 	unsigned long	 time;
91 	const char	*str;
92 	bool		 set;
93 	char		 **filenames;
94 	int		 num_files;
95 	int		 cur_file;
96 };
97 
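/*
 * Per-thread CPU masks: 'maps' selects the mmaps a recording thread reads,
 * 'affinity' the CPUs that thread is allowed to run on.
 */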
98 struct thread_mask {
99 	struct mmap_cpu_mask	maps;
100 	struct mmap_cpu_mask	affinity;
101 };
102 
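/*
 * Per-thread state for parallel data streaming (--threads): the thread id,
 * its CPU masks, msg/ack pipes to the main thread, the fds it polls, the
 * regular and overwrite mmaps it services, and byte/sample statistics.
 */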
103 struct record_thread {
104 	pid_t			tid;
105 	struct thread_mask	*mask;
106 	struct {
107 		int		msg[2];
108 		int		ack[2];
109 	} pipes;
110 	struct fdarray		pollfd;
111 	int			ctlfd_pos;
112 	int			nr_mmaps;
113 	struct mmap		**maps;
114 	struct mmap		**overwrite_maps;
115 	struct record		*rec;
116 	unsigned long long	samples;
117 	unsigned long		waking;
118 	u64			bytes_written;
119 	u64			bytes_transferred;
120 	u64			bytes_compressed;
121 };
122 
123 static __thread struct record_thread *thread;
124 
125 enum thread_msg {
126 	THREAD_MSG__UNDEFINED = 0,
127 	THREAD_MSG__READY,
128 	THREAD_MSG__MAX,
129 };
130 
131 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
132 	"UNDEFINED", "READY"
133 };
134 
135 enum thread_spec {
136 	THREAD_SPEC__UNDEFINED = 0,
137 	THREAD_SPEC__CPU,
138 	THREAD_SPEC__CORE,
139 	THREAD_SPEC__PACKAGE,
140 	THREAD_SPEC__NUMA,
141 	THREAD_SPEC__USER,
142 	THREAD_SPEC__MAX,
143 };
144 
145 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
146 	"undefined", "cpu", "core", "package", "numa", "user"
147 };
148 
149 struct pollfd_index_map {
150 	int evlist_pollfd_index;
151 	int thread_pollfd_index;
152 };
153 
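/*
 * Top-level state of a 'perf record' session: tool callbacks, parsed options,
 * output data handling, the evsel list and session, switch-output state and
 * the bookkeeping for parallel (threaded) data streaming.
 */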
154 struct record {
155 	struct perf_tool	tool;
156 	struct record_opts	opts;
157 	u64			bytes_written;
158 	u64			thread_bytes_written;
159 	struct perf_data	data;
160 	struct auxtrace_record	*itr;
161 	struct evlist	*evlist;
162 	struct perf_session	*session;
163 	struct evlist		*sb_evlist;
164 	pthread_t		thread_id;
165 	int			realtime_prio;
166 	bool			latency;
167 	bool			switch_output_event_set;
168 	bool			no_buildid;
169 	bool			no_buildid_set;
170 	bool			no_buildid_cache;
171 	bool			no_buildid_cache_set;
172 	bool			buildid_all;
173 	bool			buildid_mmap;
174 	bool			timestamp_filename;
175 	bool			timestamp_boundary;
176 	bool			off_cpu;
177 	const char		*filter_action;
178 	const char		*uid_str;
179 	struct switch_output	switch_output;
180 	unsigned long long	samples;
181 	unsigned long		output_max_size;	/* = 0: unlimited */
182 	struct perf_debuginfod	debuginfod;
183 	int			nr_threads;
184 	struct thread_mask	*thread_masks;
185 	struct record_thread	*thread_data;
186 	struct pollfd_index_map	*index_map;
187 	size_t			index_map_sz;
188 	size_t			index_map_cnt;
189 };
190 
191 static volatile int done;
192 
193 static volatile int auxtrace_record__snapshot_started;
194 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
195 static DEFINE_TRIGGER(switch_output_trigger);
196 
197 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
198 	"SYS", "NODE", "CPU"
199 };
200 
201 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
202 				  struct perf_sample *sample, struct machine *machine);
203 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
204 				   struct perf_sample *sample, struct machine *machine);
205 static int process_timestamp_boundary(const struct perf_tool *tool,
206 				      union perf_event *event,
207 				      struct perf_sample *sample,
208 				      struct machine *machine);
209 
210 #ifndef HAVE_GETTID
211 static inline pid_t gettid(void)
212 {
213 	return (pid_t)syscall(__NR_gettid);
214 }
215 #endif
216 
217 static int record__threads_enabled(struct record *rec)
218 {
219 	return rec->opts.threads_spec;
220 }
221 
222 static bool switch_output_signal(struct record *rec)
223 {
224 	return rec->switch_output.signal &&
225 	       trigger_is_ready(&switch_output_trigger);
226 }
227 
228 static bool switch_output_size(struct record *rec)
229 {
230 	return rec->switch_output.size &&
231 	       trigger_is_ready(&switch_output_trigger) &&
232 	       (rec->bytes_written >= rec->switch_output.size);
233 }
234 
235 static bool switch_output_time(struct record *rec)
236 {
237 	return rec->switch_output.time &&
238 	       trigger_is_ready(&switch_output_trigger);
239 }
240 
241 static u64 record__bytes_written(struct record *rec)
242 {
243 	return rec->bytes_written + rec->thread_bytes_written;
244 }
245 
246 static bool record__output_max_size_exceeded(struct record *rec)
247 {
248 	return rec->output_max_size &&
249 	       (record__bytes_written(rec) >= rec->output_max_size);
250 }
251 
252 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
253 			 void *bf, size_t size)
254 {
255 	struct perf_data_file *file = &rec->session->data->file;
256 
257 	if (map && map->file)
258 		file = map->file;
259 
260 	if (perf_data_file__write(file, bf, size) < 0) {
261 		pr_err("failed to write perf data, error: %m\n");
262 		return -1;
263 	}
264 
265 	if (map && map->file) {
266 		thread->bytes_written += size;
267 		rec->thread_bytes_written += size;
268 	} else {
269 		rec->bytes_written += size;
270 	}
271 
272 	if (record__output_max_size_exceeded(rec) && !done) {
273 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
274 				" stopping session ]\n",
275 				record__bytes_written(rec) >> 10);
276 		done = 1;
277 	}
278 
279 	if (switch_output_size(rec))
280 		trigger_hit(&switch_output_trigger);
281 
282 	return 0;
283 }
284 
285 static int record__aio_enabled(struct record *rec);
286 static int record__comp_enabled(struct record *rec);
287 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
288 			    void *dst, size_t dst_size, void *src, size_t src_size);
289 
290 #ifdef HAVE_AIO_SUPPORT
291 static int record__aio_write(struct aiocb *cblock, int trace_fd,
292 		void *buf, size_t size, off_t off)
293 {
294 	int rc;
295 
296 	cblock->aio_fildes = trace_fd;
297 	cblock->aio_buf    = buf;
298 	cblock->aio_nbytes = size;
299 	cblock->aio_offset = off;
300 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
301 
302 	do {
303 		rc = aio_write(cblock);
304 		if (rc == 0) {
305 			break;
306 		} else if (errno != EAGAIN) {
307 			cblock->aio_fildes = -1;
308 			pr_err("failed to queue perf data, error: %m\n");
309 			break;
310 		}
311 	} while (1);
312 
313 	return rc;
314 }
315 
316 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
317 {
318 	void *rem_buf;
319 	off_t rem_off;
320 	size_t rem_size;
321 	int rc, aio_errno;
322 	ssize_t aio_ret, written;
323 
324 	aio_errno = aio_error(cblock);
325 	if (aio_errno == EINPROGRESS)
326 		return 0;
327 
328 	written = aio_ret = aio_return(cblock);
329 	if (aio_ret < 0) {
330 		if (aio_errno != EINTR)
331 			pr_err("failed to write perf data, error: %m\n");
332 		written = 0;
333 	}
334 
335 	rem_size = cblock->aio_nbytes - written;
336 
337 	if (rem_size == 0) {
338 		cblock->aio_fildes = -1;
339 		/*
340 		 * md->refcount is incremented in record__aio_pushfn() for
341 		 * every aio write request started in record__aio_push(), so
342 		 * decrement it now that the request is complete.
343 		 */
344 		perf_mmap__put(&md->core);
345 		rc = 1;
346 	} else {
347 		/*
348 		 * The aio write request may need to be restarted with the
349 		 * remainder if the kernel didn't write the whole
350 		 * chunk at once.
351 		 */
352 		rem_off = cblock->aio_offset + written;
353 		rem_buf = (void *)(cblock->aio_buf + written);
354 		record__aio_write(cblock, cblock->aio_fildes,
355 				rem_buf, rem_size, rem_off);
356 		rc = 0;
357 	}
358 
359 	return rc;
360 }
361 
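/*
 * Reap completed aio writes for this map. With sync_all == false, return the
 * index of the first free control block so it can be reused; with
 * sync_all == true, block until every outstanding request has completed
 * (returns -1 once none are pending).
 */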
362 static int record__aio_sync(struct mmap *md, bool sync_all)
363 {
364 	struct aiocb **aiocb = md->aio.aiocb;
365 	struct aiocb *cblocks = md->aio.cblocks;
366 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
367 	int i, do_suspend;
368 
369 	do {
370 		do_suspend = 0;
371 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
372 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
373 				if (sync_all)
374 					aiocb[i] = NULL;
375 				else
376 					return i;
377 			} else {
378 				/*
379 				 * The started aio write is not complete yet,
380 				 * so it has to be waited for before the
381 				 * next allocation.
382 				 */
383 				aiocb[i] = &cblocks[i];
384 				do_suspend = 1;
385 			}
386 		}
387 		if (!do_suspend)
388 			return -1;
389 
390 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
391 			if (!(errno == EAGAIN || errno == EINTR))
392 				pr_err("failed to sync perf data, error: %m\n");
393 		}
394 	} while (1);
395 }
396 
397 struct record_aio {
398 	struct record	*rec;
399 	void		*data;
400 	size_t		size;
401 };
402 
403 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
404 {
405 	struct record_aio *aio = to;
406 
407 	/*
408 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
409 	 * buffer to release space in the kernel buffer as fast as possible, via
410 	 * perf_mmap__consume() called from perf_mmap__push().
411 	 *
412 	 * That lets the kernel proceed with storing more profiling data into
413 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
414 	 *
415 	 * Copying can be done in two steps in case the chunk of profiling data
416 	 * crosses the upper bound of the kernel buffer. In that case we first move
417 	 * the part of the data from map->start to the upper bound and then the remainder
418 	 * from the beginning of the kernel buffer to the end of the data chunk.
419 	 */
420 
421 	if (record__comp_enabled(aio->rec)) {
422 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
423 						   mmap__mmap_len(map) - aio->size,
424 						   buf, size);
425 		if (compressed < 0)
426 			return (int)compressed;
427 
428 		size = compressed;
429 	} else {
430 		memcpy(aio->data + aio->size, buf, size);
431 	}
432 
433 	if (!aio->size) {
434 		/*
435 		 * Increment map->refcount to guard the map->aio.data[] buffer
436 		 * from premature deallocation, because the map object can be
437 		 * released before the aio write request started on the
438 		 * map->aio.data[] buffer completes.
439 		 *
440 		 * perf_mmap__put() is done in record__aio_complete()
441 		 * when the started aio request completes, or in record__aio_push()
442 		 * if the request failed to start.
443 		 */
444 		perf_mmap__get(&map->core);
445 	}
446 
447 	aio->size += size;
448 
449 	return size;
450 }
451 
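/*
 * Move the data available in the map's ring buffer into a free aio buffer
 * (compressing it if requested) and queue an asynchronous write to the trace
 * file at offset *off, advancing *off and the byte accounting on success.
 */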
452 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
453 {
454 	int ret, idx;
455 	int trace_fd = rec->session->data->file.fd;
456 	struct record_aio aio = { .rec = rec, .size = 0 };
457 
458 	/*
459 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
460 	 * becomes available after the previous aio write operation.
461 	 */
462 
463 	idx = record__aio_sync(map, false);
464 	aio.data = map->aio.data[idx];
465 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
466 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
467 		return ret;
468 
469 	rec->samples++;
470 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
471 	if (!ret) {
472 		*off += aio.size;
473 		rec->bytes_written += aio.size;
474 		if (switch_output_size(rec))
475 			trigger_hit(&switch_output_trigger);
476 	} else {
477 		/*
478 		 * Drop the map->refcount taken in record__aio_pushfn() if the
479 		 * record__aio_write() operation failed to start; otherwise
480 		 * map->refcount is decremented in record__aio_complete() after
481 		 * the aio write operation finishes successfully.
482 		 */
483 		perf_mmap__put(&map->core);
484 	}
485 
486 	return ret;
487 }
488 
489 static off_t record__aio_get_pos(int trace_fd)
490 {
491 	return lseek(trace_fd, 0, SEEK_CUR);
492 }
493 
494 static void record__aio_set_pos(int trace_fd, off_t pos)
495 {
496 	lseek(trace_fd, pos, SEEK_SET);
497 }
498 
499 static void record__aio_mmap_read_sync(struct record *rec)
500 {
501 	int i;
502 	struct evlist *evlist = rec->evlist;
503 	struct mmap *maps = evlist->mmap;
504 
505 	if (!record__aio_enabled(rec))
506 		return;
507 
508 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
509 		struct mmap *map = &maps[i];
510 
511 		if (map->core.base)
512 			record__aio_sync(map, true);
513 	}
514 }
515 
516 static int nr_cblocks_default = 1;
517 static int nr_cblocks_max = 4;
518 
519 static int record__aio_parse(const struct option *opt,
520 			     const char *str,
521 			     int unset)
522 {
523 	struct record_opts *opts = (struct record_opts *)opt->value;
524 
525 	if (unset) {
526 		opts->nr_cblocks = 0;
527 	} else {
528 		if (str)
529 			opts->nr_cblocks = strtol(str, NULL, 0);
530 		if (!opts->nr_cblocks)
531 			opts->nr_cblocks = nr_cblocks_default;
532 	}
533 
534 	return 0;
535 }
536 #else /* HAVE_AIO_SUPPORT */
537 static int nr_cblocks_max = 0;
538 
539 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
540 			    off_t *off __maybe_unused)
541 {
542 	return -1;
543 }
544 
545 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
546 {
547 	return -1;
548 }
549 
550 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
551 {
552 }
553 
554 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
555 {
556 }
557 #endif
558 
559 static int record__aio_enabled(struct record *rec)
560 {
561 	return rec->opts.nr_cblocks > 0;
562 }
563 
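/*
 * Parse the --mmap-flush value: a plain number of bytes or a B/K/M/G suffixed
 * size, defaulting to MMAP_FLUSH_DEFAULT and capped at a quarter of the mmap
 * buffer size.
 */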
564 #define MMAP_FLUSH_DEFAULT 1
565 static int record__mmap_flush_parse(const struct option *opt,
566 				    const char *str,
567 				    int unset)
568 {
569 	int flush_max;
570 	struct record_opts *opts = (struct record_opts *)opt->value;
571 	static struct parse_tag tags[] = {
572 			{ .tag  = 'B', .mult = 1       },
573 			{ .tag  = 'K', .mult = 1 << 10 },
574 			{ .tag  = 'M', .mult = 1 << 20 },
575 			{ .tag  = 'G', .mult = 1 << 30 },
576 			{ .tag  = 0 },
577 	};
578 
579 	if (unset)
580 		return 0;
581 
582 	if (str) {
583 		opts->mmap_flush = parse_tag_value(str, tags);
584 		if (opts->mmap_flush == (int)-1)
585 			opts->mmap_flush = strtol(str, NULL, 0);
586 	}
587 
588 	if (!opts->mmap_flush)
589 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
590 
591 	flush_max = evlist__mmap_size(opts->mmap_pages);
592 	flush_max /= 4;
593 	if (opts->mmap_flush > flush_max)
594 		opts->mmap_flush = flush_max;
595 
596 	return 0;
597 }
598 
599 #ifdef HAVE_ZSTD_SUPPORT
600 static unsigned int comp_level_default = 1;
601 
602 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
603 {
604 	struct record_opts *opts = opt->value;
605 
606 	if (unset) {
607 		opts->comp_level = 0;
608 	} else {
609 		if (str)
610 			opts->comp_level = strtol(str, NULL, 0);
611 		if (!opts->comp_level)
612 			opts->comp_level = comp_level_default;
613 	}
614 
615 	return 0;
616 }
617 #endif
618 static unsigned int comp_level_max = 22;
619 
620 static int record__comp_enabled(struct record *rec)
621 {
622 	return rec->opts.comp_level > 0;
623 }
624 
625 static int process_synthesized_event(const struct perf_tool *tool,
626 				     union perf_event *event,
627 				     struct perf_sample *sample __maybe_unused,
628 				     struct machine *machine __maybe_unused)
629 {
630 	struct record *rec = container_of(tool, struct record, tool);
631 	return record__write(rec, NULL, event, event->header.size);
632 }
633 
634 static struct mutex synth_lock;
635 
636 static int process_locked_synthesized_event(const struct perf_tool *tool,
637 				     union perf_event *event,
638 				     struct perf_sample *sample __maybe_unused,
639 				     struct machine *machine __maybe_unused)
640 {
641 	int ret;
642 
643 	mutex_lock(&synth_lock);
644 	ret = process_synthesized_event(tool, event, sample, machine);
645 	mutex_unlock(&synth_lock);
646 	return ret;
647 }
648 
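/*
 * perf_mmap__push() callback for the synchronous (non-aio) path: optionally
 * zstd-compresses the chunk into the map's scratch buffer and writes it,
 * padding compressed records to 8-byte alignment.
 */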
649 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
650 {
651 	struct record *rec = to;
652 
653 	if (record__comp_enabled(rec)) {
654 		struct perf_record_compressed2 *event = map->data;
655 		size_t padding = 0;
656 		u8 pad[8] = {0};
657 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
658 						   mmap__mmap_len(map), bf, size);
659 
660 		if (compressed < 0)
661 			return (int)compressed;
662 
663 		bf = event;
664 		thread->samples++;
665 
666 		/*
667 		 * The record from zstd_compress() is not 8-byte aligned, which would trigger
668 		 * an ASan error, so align it here.
669 		 */
670 		event->data_size = compressed - sizeof(struct perf_record_compressed2);
671 		event->header.size = PERF_ALIGN(compressed, sizeof(u64));
672 		padding = event->header.size - compressed;
673 		return record__write(rec, map, bf, compressed) ||
674 		       record__write(rec, map, &pad, padding);
675 	}
676 
677 	thread->samples++;
678 	return record__write(rec, map, bf, size);
679 }
680 
681 static volatile sig_atomic_t signr = -1;
682 static volatile sig_atomic_t child_finished;
683 #ifdef HAVE_EVENTFD_SUPPORT
684 static volatile sig_atomic_t done_fd = -1;
685 #endif
686 
687 static void sig_handler(int sig)
688 {
689 	if (sig == SIGCHLD)
690 		child_finished = 1;
691 	else
692 		signr = sig;
693 
694 	done = 1;
695 #ifdef HAVE_EVENTFD_SUPPORT
696 	if (done_fd >= 0) {
697 		u64 tmp = 1;
698 		int orig_errno = errno;
699 
700 		/*
701 		 * It is possible for this signal handler to run after done is
702 		 * checked in the main loop, but before the perf counter fds are
703 		 * polled. If this happens, the poll() will continue to wait
704 		 * even though done is set, and will only break out if either
705 		 * another signal is received, or the counters are ready for
706 		 * read. To ensure the poll() doesn't sleep when done is set,
707 		 * use an eventfd (done_fd) to wake up the poll().
708 		 */
709 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
710 			pr_err("failed to signal wakeup fd, error: %m\n");
711 
712 		errno = orig_errno;
713 	}
714 #endif // HAVE_EVENTFD_SUPPORT
715 }
716 
717 static void sigsegv_handler(int sig)
718 {
719 	perf_hooks__recover();
720 	sighandler_dump_stack(sig);
721 }
722 
723 static void record__sig_exit(void)
724 {
725 	if (signr == -1)
726 		return;
727 
728 	signal(signr, SIG_DFL);
729 	raise(signr);
730 }
731 
732 #ifdef HAVE_AUXTRACE_SUPPORT
733 
734 static int record__process_auxtrace(const struct perf_tool *tool,
735 				    struct mmap *map,
736 				    union perf_event *event, void *data1,
737 				    size_t len1, void *data2, size_t len2)
738 {
739 	struct record *rec = container_of(tool, struct record, tool);
740 	struct perf_data *data = &rec->data;
741 	size_t padding;
742 	u8 pad[8] = {0};
743 
744 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
745 		off_t file_offset;
746 		int fd = perf_data__fd(data);
747 		int err;
748 
749 		file_offset = lseek(fd, 0, SEEK_CUR);
750 		if (file_offset == -1)
751 			return -1;
752 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
753 						     event, file_offset);
754 		if (err)
755 			return err;
756 	}
757 
758 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
759 	padding = (len1 + len2) & 7;
760 	if (padding)
761 		padding = 8 - padding;
762 
763 	record__write(rec, map, event, event->header.size);
764 	record__write(rec, map, data1, len1);
765 	if (len2)
766 		record__write(rec, map, data2, len2);
767 	record__write(rec, map, &pad, padding);
768 
769 	return 0;
770 }
771 
772 static int record__auxtrace_mmap_read(struct record *rec,
773 				      struct mmap *map)
774 {
775 	int ret;
776 
777 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
778 				  record__process_auxtrace);
779 	if (ret < 0)
780 		return ret;
781 
782 	if (ret)
783 		rec->samples++;
784 
785 	return 0;
786 }
787 
788 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
789 					       struct mmap *map)
790 {
791 	int ret;
792 
793 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
794 					   record__process_auxtrace,
795 					   rec->opts.auxtrace_snapshot_size);
796 	if (ret < 0)
797 		return ret;
798 
799 	if (ret)
800 		rec->samples++;
801 
802 	return 0;
803 }
804 
805 static int record__auxtrace_read_snapshot_all(struct record *rec)
806 {
807 	int i;
808 	int rc = 0;
809 
810 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
811 		struct mmap *map = &rec->evlist->mmap[i];
812 
813 		if (!map->auxtrace_mmap.base)
814 			continue;
815 
816 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
817 			rc = -1;
818 			goto out;
819 		}
820 	}
821 out:
822 	return rc;
823 }
824 
825 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
826 {
827 	pr_debug("Recording AUX area tracing snapshot\n");
828 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
829 		trigger_error(&auxtrace_snapshot_trigger);
830 	} else {
831 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
832 			trigger_error(&auxtrace_snapshot_trigger);
833 		else
834 			trigger_ready(&auxtrace_snapshot_trigger);
835 	}
836 }
837 
838 static int record__auxtrace_snapshot_exit(struct record *rec)
839 {
840 	if (trigger_is_error(&auxtrace_snapshot_trigger))
841 		return 0;
842 
843 	if (!auxtrace_record__snapshot_started &&
844 	    auxtrace_record__snapshot_start(rec->itr))
845 		return -1;
846 
847 	record__read_auxtrace_snapshot(rec, true);
848 	if (trigger_is_error(&auxtrace_snapshot_trigger))
849 		return -1;
850 
851 	return 0;
852 }
853 
854 static int record__auxtrace_init(struct record *rec)
855 {
856 	int err;
857 
858 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
859 	    && record__threads_enabled(rec)) {
860 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
861 		return -EINVAL;
862 	}
863 
864 	if (!rec->itr) {
865 		rec->itr = auxtrace_record__init(rec->evlist, &err);
866 		if (err)
867 			return err;
868 	}
869 
870 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
871 					      rec->opts.auxtrace_snapshot_opts);
872 	if (err)
873 		return err;
874 
875 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
876 					    rec->opts.auxtrace_sample_opts);
877 	if (err)
878 		return err;
879 
880 	err = auxtrace_parse_aux_action(rec->evlist);
881 	if (err)
882 		return err;
883 
884 	return auxtrace_parse_filters(rec->evlist);
885 }
886 
887 #else
888 
889 static inline
890 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
891 			       struct mmap *map __maybe_unused)
892 {
893 	return 0;
894 }
895 
896 static inline
897 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
898 				    bool on_exit __maybe_unused)
899 {
900 }
901 
902 static inline
903 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
904 {
905 	return 0;
906 }
907 
908 static inline
909 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
910 {
911 	return 0;
912 }
913 
914 static int record__auxtrace_init(struct record *rec __maybe_unused)
915 {
916 	return 0;
917 }
918 
919 #endif
920 
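/*
 * Unless an event with attr.text_poke is already configured, add a dummy
 * event on all CPUs with text_poke and ksymbol set so that kernel text
 * modifications are captured.
 */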
921 static int record__config_text_poke(struct evlist *evlist)
922 {
923 	struct evsel *evsel;
924 
925 	/* Nothing to do if text poke is already configured */
926 	evlist__for_each_entry(evlist, evsel) {
927 		if (evsel->core.attr.text_poke)
928 			return 0;
929 	}
930 
931 	evsel = evlist__add_dummy_on_all_cpus(evlist);
932 	if (!evsel)
933 		return -ENOMEM;
934 
935 	evsel->core.attr.text_poke = 1;
936 	evsel->core.attr.ksymbol = 1;
937 	evsel->immediate = true;
938 	evsel__set_sample_bit(evsel, TIME);
939 
940 	return 0;
941 }
942 
943 static int record__config_off_cpu(struct record *rec)
944 {
945 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
946 }
947 
948 static bool record__tracking_system_wide(struct record *rec)
949 {
950 	struct evlist *evlist = rec->evlist;
951 	struct evsel *evsel;
952 
953 	/*
954 	 * If a non-dummy evsel exists, system-wide sideband is needed to
955 	 * help parse sample information.
956 	 * For example, PERF_RECORD_MMAP events help resolve symbols,
957 	 * and PERF_RECORD_COMM events provide the task executable name.
958 	 */
959 	evlist__for_each_entry(evlist, evsel) {
960 		if (!evsel__is_dummy_event(evsel))
961 			return true;
962 	}
963 
964 	return false;
965 }
966 
967 static int record__config_tracking_events(struct record *rec)
968 {
969 	struct record_opts *opts = &rec->opts;
970 	struct evlist *evlist = rec->evlist;
971 	bool system_wide = false;
972 	struct evsel *evsel;
973 
974 	/*
975 	 * For initial_delay, system-wide or a hybrid system, we need to add a
976 	 * tracking event so that we can capture PERF_RECORD_MMAP events covering the
977 	 * delay of waiting or of event synthesis.
978 	 */
979 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
980 	    perf_pmus__num_core_pmus() > 1) {
981 
982 		/*
983 		 * User space tasks can migrate between CPUs, so when tracing
984 		 * selected CPUs, sideband for all CPUs is still needed.
985 		 */
986 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
987 			system_wide = true;
988 
989 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
990 		if (!evsel)
991 			return -ENOMEM;
992 
993 		/*
994 		 * Enable the tracking event when the process is forked for
995 		 * initial_delay, immediately for system wide.
996 		 */
997 		if (opts->target.initial_delay && !evsel->immediate &&
998 		    !target__has_cpu(&opts->target))
999 			evsel->core.attr.enable_on_exec = 1;
1000 		else
1001 			evsel->immediate = 1;
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 static bool record__kcore_readable(struct machine *machine)
1008 {
1009 	char kcore[PATH_MAX];
1010 	int fd;
1011 
1012 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
1013 
1014 	fd = open(kcore, O_RDONLY);
1015 	if (fd < 0)
1016 		return false;
1017 
1018 	close(fd);
1019 
1020 	return true;
1021 }
1022 
1023 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
1024 {
1025 	char from_dir[PATH_MAX];
1026 	char kcore_dir[PATH_MAX];
1027 	int ret;
1028 
1029 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1030 
1031 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1032 	if (ret)
1033 		return ret;
1034 
1035 	return kcore_copy(from_dir, kcore_dir);
1036 }
1037 
1038 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1039 {
1040 	thread_data->pipes.msg[0] = -1;
1041 	thread_data->pipes.msg[1] = -1;
1042 	thread_data->pipes.ack[0] = -1;
1043 	thread_data->pipes.ack[1] = -1;
1044 }
1045 
1046 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1047 {
1048 	if (pipe(thread_data->pipes.msg))
1049 		return -EINVAL;
1050 
1051 	if (pipe(thread_data->pipes.ack)) {
1052 		close(thread_data->pipes.msg[0]);
1053 		thread_data->pipes.msg[0] = -1;
1054 		close(thread_data->pipes.msg[1]);
1055 		thread_data->pipes.msg[1] = -1;
1056 		return -EINVAL;
1057 	}
1058 
1059 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1060 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1061 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1062 
1063 	return 0;
1064 }
1065 
1066 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1067 {
1068 	if (thread_data->pipes.msg[0] != -1) {
1069 		close(thread_data->pipes.msg[0]);
1070 		thread_data->pipes.msg[0] = -1;
1071 	}
1072 	if (thread_data->pipes.msg[1] != -1) {
1073 		close(thread_data->pipes.msg[1]);
1074 		thread_data->pipes.msg[1] = -1;
1075 	}
1076 	if (thread_data->pipes.ack[0] != -1) {
1077 		close(thread_data->pipes.ack[0]);
1078 		thread_data->pipes.ack[0] = -1;
1079 	}
1080 	if (thread_data->pipes.ack[1] != -1) {
1081 		close(thread_data->pipes.ack[1]);
1082 		thread_data->pipes.ack[1] = -1;
1083 	}
1084 }
1085 
1086 static bool evlist__per_thread(struct evlist *evlist)
1087 {
1088 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1089 }
1090 
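/*
 * Assign evlist mmaps to a recording thread: all of them in per-thread mode,
 * otherwise only the mmaps whose CPU is set in the thread's 'maps' mask.
 */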
1091 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1092 {
1093 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1094 	struct mmap *mmap = evlist->mmap;
1095 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1096 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1097 	bool per_thread = evlist__per_thread(evlist);
1098 
1099 	if (per_thread)
1100 		thread_data->nr_mmaps = nr_mmaps;
1101 	else
1102 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1103 						      thread_data->mask->maps.nbits);
1104 	if (mmap) {
1105 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1106 		if (!thread_data->maps)
1107 			return -ENOMEM;
1108 	}
1109 	if (overwrite_mmap) {
1110 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1111 		if (!thread_data->overwrite_maps) {
1112 			zfree(&thread_data->maps);
1113 			return -ENOMEM;
1114 		}
1115 	}
1116 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1117 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1118 
1119 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1120 		if (per_thread ||
1121 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1122 			if (thread_data->maps) {
1123 				thread_data->maps[tm] = &mmap[m];
1124 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1125 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1126 			}
1127 			if (thread_data->overwrite_maps) {
1128 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1129 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1130 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1131 			}
1132 			tm++;
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1140 {
1141 	int f, tm, pos;
1142 	struct mmap *map, *overwrite_map;
1143 
1144 	fdarray__init(&thread_data->pollfd, 64);
1145 
1146 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1147 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1148 		overwrite_map = thread_data->overwrite_maps ?
1149 				thread_data->overwrite_maps[tm] : NULL;
1150 
1151 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1152 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1153 
1154 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1155 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1156 							      &evlist->core.pollfd);
1157 				if (pos < 0)
1158 					return pos;
1159 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1160 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1161 			}
1162 		}
1163 	}
1164 
1165 	return 0;
1166 }
1167 
1168 static void record__free_thread_data(struct record *rec)
1169 {
1170 	int t;
1171 	struct record_thread *thread_data = rec->thread_data;
1172 
1173 	if (thread_data == NULL)
1174 		return;
1175 
1176 	for (t = 0; t < rec->nr_threads; t++) {
1177 		record__thread_data_close_pipes(&thread_data[t]);
1178 		zfree(&thread_data[t].maps);
1179 		zfree(&thread_data[t].overwrite_maps);
1180 		fdarray__exit(&thread_data[t].pollfd);
1181 	}
1182 
1183 	zfree(&rec->thread_data);
1184 }
1185 
1186 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1187 						    int evlist_pollfd_index,
1188 						    int thread_pollfd_index)
1189 {
1190 	size_t x = rec->index_map_cnt;
1191 
1192 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1193 		return -ENOMEM;
1194 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1195 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1196 	rec->index_map_cnt += 1;
1197 	return 0;
1198 }
1199 
1200 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1201 						    struct evlist *evlist,
1202 						    struct record_thread *thread_data)
1203 {
1204 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1205 	struct pollfd *t_entries = thread_data->pollfd.entries;
1206 	int err = 0;
1207 	size_t i;
1208 
1209 	for (i = 0; i < rec->index_map_cnt; i++) {
1210 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1211 		int t_pos = rec->index_map[i].thread_pollfd_index;
1212 
1213 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1214 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1215 			pr_err("Thread and evlist pollfd index mismatch\n");
1216 			err = -EINVAL;
1217 			continue;
1218 		}
1219 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1220 	}
1221 	return err;
1222 }
1223 
1224 static int record__dup_non_perf_events(struct record *rec,
1225 				       struct evlist *evlist,
1226 				       struct record_thread *thread_data)
1227 {
1228 	struct fdarray *fda = &evlist->core.pollfd;
1229 	int i, ret;
1230 
1231 	for (i = 0; i < fda->nr; i++) {
1232 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1233 			continue;
1234 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1235 		if (ret < 0) {
1236 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1237 			return ret;
1238 		}
1239 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1240 			  thread_data, ret, fda->entries[i].fd);
1241 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1242 		if (ret < 0) {
1243 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1244 			return ret;
1245 		}
1246 	}
1247 	return 0;
1248 }
1249 
1250 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1251 {
1252 	int t, ret;
1253 	struct record_thread *thread_data;
1254 
1255 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1256 	if (!rec->thread_data) {
1257 		pr_err("Failed to allocate thread data\n");
1258 		return -ENOMEM;
1259 	}
1260 	thread_data = rec->thread_data;
1261 
1262 	for (t = 0; t < rec->nr_threads; t++)
1263 		record__thread_data_init_pipes(&thread_data[t]);
1264 
1265 	for (t = 0; t < rec->nr_threads; t++) {
1266 		thread_data[t].rec = rec;
1267 		thread_data[t].mask = &rec->thread_masks[t];
1268 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1269 		if (ret) {
1270 			pr_err("Failed to initialize thread[%d] maps\n", t);
1271 			goto out_free;
1272 		}
1273 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1274 		if (ret) {
1275 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1276 			goto out_free;
1277 		}
1278 		if (t) {
1279 			thread_data[t].tid = -1;
1280 			ret = record__thread_data_open_pipes(&thread_data[t]);
1281 			if (ret) {
1282 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1283 				goto out_free;
1284 			}
1285 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1286 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1287 			if (ret < 0) {
1288 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1289 				goto out_free;
1290 			}
1291 			thread_data[t].ctlfd_pos = ret;
1292 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1293 				 thread_data, thread_data[t].ctlfd_pos,
1294 				 thread_data[t].pipes.msg[0]);
1295 		} else {
1296 			thread_data[t].tid = gettid();
1297 
1298 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1299 			if (ret < 0)
1300 				goto out_free;
1301 
1302 			thread_data[t].ctlfd_pos = -1; /* Not used */
1303 		}
1304 	}
1305 
1306 	return 0;
1307 
1308 out_free:
1309 	record__free_thread_data(rec);
1310 
1311 	return ret;
1312 }
1313 
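/*
 * mmap the evlist ring buffers with the configured AIO, affinity, flush and
 * compression parameters, set up the control fd, allocate per-thread data
 * and, in parallel streaming mode, create one output file per mmap in the
 * perf.data directory.
 */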
1314 static int record__mmap_evlist(struct record *rec,
1315 			       struct evlist *evlist)
1316 {
1317 	int i, ret;
1318 	struct record_opts *opts = &rec->opts;
1319 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1320 				  opts->auxtrace_sample_mode;
1321 	char msg[512];
1322 
1323 	if (opts->affinity != PERF_AFFINITY_SYS)
1324 		cpu__setup_cpunode_map();
1325 
1326 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1327 				 opts->auxtrace_mmap_pages,
1328 				 auxtrace_overwrite,
1329 				 opts->nr_cblocks, opts->affinity,
1330 				 opts->mmap_flush, opts->comp_level) < 0) {
1331 		if (errno == EPERM) {
1332 			pr_err("Permission error mapping pages.\n"
1333 			       "Consider increasing "
1334 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1335 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1336 			       "(current value: %u,%u)\n",
1337 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1338 			return -errno;
1339 		} else {
1340 			pr_err("failed to mmap with %d (%s)\n", errno,
1341 				str_error_r(errno, msg, sizeof(msg)));
1342 			if (errno)
1343 				return -errno;
1344 			else
1345 				return -EINVAL;
1346 		}
1347 	}
1348 
1349 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1350 		return -1;
1351 
1352 	ret = record__alloc_thread_data(rec, evlist);
1353 	if (ret)
1354 		return ret;
1355 
1356 	if (record__threads_enabled(rec)) {
1357 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1358 		if (ret) {
1359 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1360 			return ret;
1361 		}
1362 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1363 			if (evlist->mmap)
1364 				evlist->mmap[i].file = &rec->data.dir.files[i];
1365 			if (evlist->overwrite_mmap)
1366 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1367 		}
1368 	}
1369 
1370 	return 0;
1371 }
1372 
1373 static int record__mmap(struct record *rec)
1374 {
1375 	return record__mmap_evlist(rec, rec->evlist);
1376 }
1377 
1378 static int record__open(struct record *rec)
1379 {
1380 	char msg[BUFSIZ];
1381 	struct evsel *pos;
1382 	struct evlist *evlist = rec->evlist;
1383 	struct perf_session *session = rec->session;
1384 	struct record_opts *opts = &rec->opts;
1385 	int rc = 0;
1386 
1387 	evlist__for_each_entry(evlist, pos) {
1388 try_again:
1389 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1390 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1391 				if (verbose > 0)
1392 					ui__warning("%s\n", msg);
1393 				goto try_again;
1394 			}
1395 			if ((errno == EINVAL || errno == EBADF) &&
1396 			    pos->core.leader != &pos->core &&
1397 			    pos->weak_group) {
1398 			        pos = evlist__reset_weak_group(evlist, pos, true);
1399 				goto try_again;
1400 			}
1401 			rc = -errno;
1402 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1403 			ui__error("%s\n", msg);
1404 			goto out;
1405 		}
1406 
1407 		pos->supported = true;
1408 	}
1409 
1410 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1411 		pr_warning(
1412 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1413 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1414 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1415 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1416 "Samples in kernel modules won't be resolved at all.\n\n"
1417 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1418 "even with a suitable vmlinux or kallsyms file.\n\n");
1419 	}
1420 
1421 	if (evlist__apply_filters(evlist, &pos, &opts->target)) {
1422 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1423 			pos->filter ?: "BPF", evsel__name(pos), errno,
1424 			str_error_r(errno, msg, sizeof(msg)));
1425 		rc = -1;
1426 		goto out;
1427 	}
1428 
1429 	rc = record__mmap(rec);
1430 	if (rc)
1431 		goto out;
1432 
1433 	session->evlist = evlist;
1434 	perf_session__set_id_hdr_size(session);
1435 out:
1436 	return rc;
1437 }
1438 
1439 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1440 {
1441 	if (rec->evlist->first_sample_time == 0)
1442 		rec->evlist->first_sample_time = sample_time;
1443 
1444 	if (sample_time)
1445 		rec->evlist->last_sample_time = sample_time;
1446 }
1447 
1448 static int process_sample_event(const struct perf_tool *tool,
1449 				union perf_event *event,
1450 				struct perf_sample *sample,
1451 				struct evsel *evsel,
1452 				struct machine *machine)
1453 {
1454 	struct record *rec = container_of(tool, struct record, tool);
1455 
1456 	set_timestamp_boundary(rec, sample->time);
1457 
1458 	if (rec->buildid_all)
1459 		return 0;
1460 
1461 	rec->samples++;
1462 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1463 }
1464 
1465 static int process_buildids(struct record *rec)
1466 {
1467 	struct perf_session *session = rec->session;
1468 
1469 	if (perf_data__size(&rec->data) == 0)
1470 		return 0;
1471 
1472 	/*
1473 	 * During this process, it'll load the kernel map and replace
1474 	 * dso->long_name with a real pathname it found.  In this case
1475 	 * we prefer a vmlinux path like
1476 	 *   /lib/modules/3.16.4/build/vmlinux
1477 	 *
1478 	 * rather than a build-id path (in the debug directory), e.g.
1479 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1480 	 */
1481 	symbol_conf.ignore_vmlinux_buildid = true;
1482 
1483 	/*
1484 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1485 	 * so there is no need to process samples. But if timestamp_boundary is
1486 	 * enabled, it still needs to walk all samples to get the timestamps of the
1487 	 * first/last samples.
1488 	 */
1489 	if (rec->buildid_all && !rec->timestamp_boundary)
1490 		rec->tool.sample = process_event_sample_stub;
1491 
1492 	return perf_session__process_events(session);
1493 }
1494 
1495 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1496 {
1497 	int err;
1498 	struct perf_tool *tool = data;
1499 	/*
1500 	 * As for the guest kernel, when processing the record & report subcommands,
1501 	 * we arrange the module mmaps prior to the guest kernel mmap and trigger
1502 	 * a DSO preload, because by default guest module symbols are loaded
1503 	 * from guest kallsyms instead of /lib/modules/XXX/XXX. This
1504 	 * method avoids missing symbols when the first address is
1505 	 * in a module instead of in the guest kernel.
1506 	 */
1507 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1508 					     machine);
1509 	if (err < 0)
1510 		pr_err("Couldn't record guest kernel [%d]'s reference"
1511 		       " relocation symbol.\n", machine->pid);
1512 
1513 	/*
1514 	 * We use _stext for the guest kernel because the guest kernel's /proc/kallsyms
1515 	 * sometimes has no _text.
1516 	 */
1517 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1518 						 machine);
1519 	if (err < 0)
1520 		pr_err("Couldn't record guest kernel [%d]'s reference"
1521 		       " relocation symbol.\n", machine->pid);
1522 }
1523 
1524 static struct perf_event_header finished_round_event = {
1525 	.size = sizeof(struct perf_event_header),
1526 	.type = PERF_RECORD_FINISHED_ROUND,
1527 };
1528 
1529 static struct perf_event_header finished_init_event = {
1530 	.size = sizeof(struct perf_event_header),
1531 	.type = PERF_RECORD_FINISHED_INIT,
1532 };
1533 
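/*
 * When an mmap affinity mode other than the default is requested, migrate the
 * reading thread onto the CPUs backing the map before touching its data, so
 * that accesses stay local.
 */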
1534 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1535 {
1536 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1537 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1538 			  thread->mask->affinity.nbits)) {
1539 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1540 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1541 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1542 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1543 					(cpu_set_t *)thread->mask->affinity.bits);
1544 		if (verbose == 2) {
1545 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1546 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1547 		}
1548 	}
1549 }
1550 
1551 static size_t process_comp_header(void *record, size_t increment)
1552 {
1553 	struct perf_record_compressed2 *event = record;
1554 	size_t size = sizeof(*event);
1555 
1556 	if (increment) {
1557 		event->header.size += increment;
1558 		return increment;
1559 	}
1560 
1561 	event->header.type = PERF_RECORD_COMPRESSED2;
1562 	event->header.size = size;
1563 
1564 	return size;
1565 }
1566 
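/*
 * Compress src into dst as PERF_RECORD_COMPRESSED2 records, using the per-map
 * zstd state in parallel streaming mode and the session-wide state otherwise,
 * and update the transferred/compressed byte statistics accordingly.
 */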
1567 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1568 			    void *dst, size_t dst_size, void *src, size_t src_size)
1569 {
1570 	ssize_t compressed;
1571 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed2) - 1;
1572 	struct zstd_data *zstd_data = &session->zstd_data;
1573 
1574 	if (map && map->file)
1575 		zstd_data = &map->zstd_data;
1576 
1577 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1578 						     max_record_size, process_comp_header);
1579 	if (compressed < 0)
1580 		return compressed;
1581 
1582 	if (map && map->file) {
1583 		thread->bytes_transferred += src_size;
1584 		thread->bytes_compressed  += compressed;
1585 	} else {
1586 		session->bytes_transferred += src_size;
1587 		session->bytes_compressed  += compressed;
1588 	}
1589 
1590 	return compressed;
1591 }
1592 
1593 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1594 				    bool overwrite, bool synch)
1595 {
1596 	u64 bytes_written = rec->bytes_written;
1597 	int i;
1598 	int rc = 0;
1599 	int nr_mmaps;
1600 	struct mmap **maps;
1601 	int trace_fd = rec->data.file.fd;
1602 	off_t off = 0;
1603 
1604 	if (!evlist)
1605 		return 0;
1606 
1607 	nr_mmaps = thread->nr_mmaps;
1608 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1609 
1610 	if (!maps)
1611 		return 0;
1612 
1613 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1614 		return 0;
1615 
1616 	if (record__aio_enabled(rec))
1617 		off = record__aio_get_pos(trace_fd);
1618 
1619 	for (i = 0; i < nr_mmaps; i++) {
1620 		u64 flush = 0;
1621 		struct mmap *map = maps[i];
1622 
1623 		if (map->core.base) {
1624 			record__adjust_affinity(rec, map);
1625 			if (synch) {
1626 				flush = map->core.flush;
1627 				map->core.flush = 1;
1628 			}
1629 			if (!record__aio_enabled(rec)) {
1630 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1631 					if (synch)
1632 						map->core.flush = flush;
1633 					rc = -1;
1634 					goto out;
1635 				}
1636 			} else {
1637 				if (record__aio_push(rec, map, &off) < 0) {
1638 					record__aio_set_pos(trace_fd, off);
1639 					if (synch)
1640 						map->core.flush = flush;
1641 					rc = -1;
1642 					goto out;
1643 				}
1644 			}
1645 			if (synch)
1646 				map->core.flush = flush;
1647 		}
1648 
1649 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1650 		    !rec->opts.auxtrace_sample_mode &&
1651 		    record__auxtrace_mmap_read(rec, map) != 0) {
1652 			rc = -1;
1653 			goto out;
1654 		}
1655 	}
1656 
1657 	if (record__aio_enabled(rec))
1658 		record__aio_set_pos(trace_fd, off);
1659 
1660 	/*
1661 	 * Mark the round finished if we wrote
1662 	 * at least one event.
1663 	 *
1664 	 * No need for round events in directory mode,
1665 	 * because the per-cpu maps and files already hold data
1666 	 * sorted by the kernel.
1667 	 */
1668 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1669 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1670 
1671 	if (overwrite)
1672 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1673 out:
1674 	return rc;
1675 }
1676 
1677 static int record__mmap_read_all(struct record *rec, bool synch)
1678 {
1679 	int err;
1680 
1681 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1682 	if (err)
1683 		return err;
1684 
1685 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1686 }
1687 
1688 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1689 					   void *arg __maybe_unused)
1690 {
1691 	struct perf_mmap *map = fda->priv[fd].ptr;
1692 
1693 	if (map)
1694 		perf_mmap__put(map);
1695 }
1696 
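/*
 * Body of an auxiliary recording thread: report readiness over the ack pipe,
 * then loop reading its mmaps, polling when no new samples arrive, until the
 * main thread closes the message pipe (POLLHUP); flush once more and
 * acknowledge termination.
 */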
1697 static void *record__thread(void *arg)
1698 {
1699 	enum thread_msg msg = THREAD_MSG__READY;
1700 	bool terminate = false;
1701 	struct fdarray *pollfd;
1702 	int err, ctlfd_pos;
1703 
1704 	thread = arg;
1705 	thread->tid = gettid();
1706 
1707 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1708 	if (err == -1)
1709 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1710 			   thread->tid, strerror(errno));
1711 
1712 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1713 
1714 	pollfd = &thread->pollfd;
1715 	ctlfd_pos = thread->ctlfd_pos;
1716 
1717 	for (;;) {
1718 		unsigned long long hits = thread->samples;
1719 
1720 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1721 			break;
1722 
1723 		if (hits == thread->samples) {
1724 
1725 			err = fdarray__poll(pollfd, -1);
1726 			/*
1727 			 * Propagate the error only if there is one. Ignore a positive
1728 			 * number of returned events and interrupt errors.
1729 			 */
1730 			if (err > 0 || (err < 0 && errno == EINTR))
1731 				err = 0;
1732 			thread->waking++;
1733 
1734 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1735 					    record__thread_munmap_filtered, NULL) == 0)
1736 				break;
1737 		}
1738 
1739 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1740 			terminate = true;
1741 			close(thread->pipes.msg[0]);
1742 			thread->pipes.msg[0] = -1;
1743 			pollfd->entries[ctlfd_pos].fd = -1;
1744 			pollfd->entries[ctlfd_pos].events = 0;
1745 		}
1746 
1747 		pollfd->entries[ctlfd_pos].revents = 0;
1748 	}
1749 	record__mmap_read_all(thread->rec, true);
1750 
1751 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1752 	if (err == -1)
1753 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1754 			   thread->tid, strerror(errno));
1755 
1756 	return NULL;
1757 }
1758 
1759 static void record__init_features(struct record *rec)
1760 {
1761 	struct perf_session *session = rec->session;
1762 	int feat;
1763 
1764 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1765 		perf_header__set_feat(&session->header, feat);
1766 
1767 	if (rec->no_buildid)
1768 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1769 
1770 	if (!have_tracepoints(&rec->evlist->core.entries))
1771 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1772 
1773 	if (!rec->opts.branch_stack)
1774 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1775 
1776 	if (!rec->opts.full_auxtrace)
1777 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1778 
1779 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1780 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1781 
1782 	if (!rec->opts.use_clockid)
1783 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1784 
1785 	if (!record__threads_enabled(rec))
1786 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1787 
1788 	if (!record__comp_enabled(rec))
1789 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1790 
1791 	perf_header__clear_feat(&session->header, HEADER_STAT);
1792 }
1793 
1794 static void
1795 record__finish_output(struct record *rec)
1796 {
1797 	int i;
1798 	struct perf_data *data = &rec->data;
1799 	int fd = perf_data__fd(data);
1800 
1801 	if (data->is_pipe) {
1802 		/* Just to display approx. size */
1803 		data->file.size = rec->bytes_written;
1804 		return;
1805 	}
1806 
1807 	rec->session->header.data_size += rec->bytes_written;
1808 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1809 	if (record__threads_enabled(rec)) {
1810 		for (i = 0; i < data->dir.nr; i++)
1811 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1812 	}
1813 
1814 	if (!rec->no_buildid) {
1815 		process_buildids(rec);
1816 
1817 		if (rec->buildid_all)
1818 			perf_session__dsos_hit_all(rec->session);
1819 	}
1820 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1821 
1822 	return;
1823 }
1824 
1825 static int record__synthesize_workload(struct record *rec, bool tail)
1826 {
1827 	int err;
1828 	struct perf_thread_map *thread_map;
1829 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1830 
1831 	if (rec->opts.tail_synthesize != tail)
1832 		return 0;
1833 
1834 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1835 	if (thread_map == NULL)
1836 		return -1;
1837 
1838 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1839 						 process_synthesized_event,
1840 						 &rec->session->machines.host,
1841 						 needs_mmap,
1842 						 rec->opts.sample_address);
1843 	perf_thread_map__put(thread_map);
1844 	return err;
1845 }
1846 
1847 static int write_finished_init(struct record *rec, bool tail)
1848 {
1849 	if (rec->opts.tail_synthesize != tail)
1850 		return 0;
1851 
1852 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1853 }
1854 
1855 static int record__synthesize(struct record *rec, bool tail);
1856 
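/*
 * Rotate the output for --switch-output: flush outstanding aio writes,
 * synthesize tail events, finalize the current file, switch perf.data to a
 * timestamped name and, unless exiting, re-synthesize tracking events so the
 * new file is self-contained.
 */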
1857 static int
1858 record__switch_output(struct record *rec, bool at_exit)
1859 {
1860 	struct perf_data *data = &rec->data;
1861 	char *new_filename = NULL;
1862 	int fd, err;
1863 
1864 	/* Same size:      "2015122520103046" */
1865 	char timestamp[] = "InvalidTimestamp";
1866 
1867 	record__aio_mmap_read_sync(rec);
1868 
1869 	write_finished_init(rec, true);
1870 
1871 	record__synthesize(rec, true);
1872 	if (target__none(&rec->opts.target))
1873 		record__synthesize_workload(rec, true);
1874 
1875 	rec->samples = 0;
1876 	record__finish_output(rec);
1877 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1878 	if (err) {
1879 		pr_err("Failed to get current timestamp\n");
1880 		return -EINVAL;
1881 	}
1882 
1883 	fd = perf_data__switch(data, timestamp,
1884 			       rec->session->header.data_offset,
1885 			       at_exit, &new_filename);
1886 	if (fd >= 0 && !at_exit) {
1887 		rec->bytes_written = 0;
1888 		rec->session->header.data_size = 0;
1889 	}
1890 
1891 	if (!quiet) {
1892 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1893 			data->path, timestamp);
1894 	}
1895 
1896 	if (rec->switch_output.num_files) {
1897 		int n = rec->switch_output.cur_file + 1;
1898 
1899 		if (n >= rec->switch_output.num_files)
1900 			n = 0;
1901 		rec->switch_output.cur_file = n;
1902 		if (rec->switch_output.filenames[n]) {
1903 			remove(rec->switch_output.filenames[n]);
1904 			zfree(&rec->switch_output.filenames[n]);
1905 		}
1906 		rec->switch_output.filenames[n] = new_filename;
1907 	} else {
1908 		free(new_filename);
1909 	}
1910 
1911 	/* Output tracking events */
1912 	if (!at_exit) {
1913 		record__synthesize(rec, false);
1914 
1915 		/*
1916 		 * In 'perf record --switch-output' without -a,
1917 		 * record__synthesize() in record__switch_output() won't
1918 		 * generate tracking events because there's no thread_map
1919 		 * in the evlist, so the newly created perf.data would lack
1920 		 * map and comm information.
1921 		 * Create a fake thread_map and directly call
1922 		 * perf_event__synthesize_thread_map() for those events.
1923 		 */
1924 		if (target__none(&rec->opts.target))
1925 			record__synthesize_workload(rec, false);
1926 		write_finished_init(rec, false);
1927 	}
1928 	return fd;
1929 }
1930 
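/*
 * Write a synthesized PERF_RECORD_LOST_SAMPLES event for the given evsel,
 * appending an id sample for the cpu/thread indices so report tools can
 * attribute the lost count correctly.
 */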
1931 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1932 					struct perf_record_lost_samples *lost,
1933 					int cpu_idx, int thread_idx, u64 lost_count,
1934 					u16 misc_flag)
1935 {
1936 	struct perf_sample_id *sid;
1937 	struct perf_sample sample;
1938 	int id_hdr_size;
1939 
1940 	perf_sample__init(&sample, /*all=*/true);
1941 	lost->lost = lost_count;
1942 	if (evsel->core.ids) {
1943 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1944 		sample.id = sid->id;
1945 	}
1946 
1947 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1948 						       evsel->core.attr.sample_type, &sample);
1949 	lost->header.size = sizeof(*lost) + id_hdr_size;
1950 	lost->header.misc = misc_flag;
1951 	record__write(rec, NULL, lost, lost->header.size);
1952 	perf_sample__exit(&sample);
1953 }
1954 
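/*
 * Read the lost-sample counts for every event (and any BPF filter drops) and
 * emit PERF_RECORD_LOST_SAMPLES records for the non-zero ones.
 */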
1955 static void record__read_lost_samples(struct record *rec)
1956 {
1957 	struct perf_session *session = rec->session;
1958 	struct perf_record_lost_samples_and_ids lost;
1959 	struct evsel *evsel;
1960 
1961 	/* there was an error during record__open */
1962 	if (session->evlist == NULL)
1963 		return;
1964 
1965 	evlist__for_each_entry(session->evlist, evsel) {
1966 		struct xyarray *xy = evsel->core.sample_id;
1967 		u64 lost_count;
1968 
1969 		if (xy == NULL || evsel->core.fd == NULL)
1970 			continue;
1971 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1972 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1973 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1974 			continue;
1975 		}
1976 
1977 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1978 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1979 				struct perf_counts_values count;
1980 
1981 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1982 					pr_debug("read LOST count failed\n");
1983 					return;
1984 				}
1985 
1986 				if (count.lost) {
1987 					memset(&lost, 0, sizeof(lost));
1988 					lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1989 					__record__save_lost_samples(rec, evsel, &lost.lost,
1990 								    x, y, count.lost, 0);
1991 				}
1992 			}
1993 		}
1994 
1995 		lost_count = perf_bpf_filter__lost_count(evsel);
1996 		if (lost_count) {
1997 			memset(&lost, 0, sizeof(lost));
1998 			lost.lost.header.type = PERF_RECORD_LOST_SAMPLES;
1999 			__record__save_lost_samples(rec, evsel, &lost.lost, 0, 0, lost_count,
2000 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
2001 		}
2002 	}
2003 }
2004 
2005 static volatile sig_atomic_t workload_exec_errno;
2006 
2007 /*
2008  * evlist__prepare_workload will send a SIGUSR1
2009  * if the fork fails, since we asked for it by setting its
2010  * want_signal to true.
2011  */
2012 static void workload_exec_failed_signal(int signo __maybe_unused,
2013 					siginfo_t *info,
2014 					void *ucontext __maybe_unused)
2015 {
2016 	workload_exec_errno = info->si_value.sival_int;
2017 	done = 1;
2018 	child_finished = 1;
2019 }
2020 
2021 static void snapshot_sig_handler(int sig);
2022 static void alarm_sig_handler(int sig);
2023 
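/*
 * Pick any mmapped control page: it carries the time conversion fields used
 * to synthesize PERF_RECORD_TIME_CONV below.
 */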
2024 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2025 {
2026 	if (evlist) {
2027 		if (evlist->mmap && evlist->mmap[0].core.base)
2028 			return evlist->mmap[0].core.base;
2029 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2030 			return evlist->overwrite_mmap[0].core.base;
2031 	}
2032 	return NULL;
2033 }
2034 
2035 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2036 {
2037 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2038 	if (pc)
2039 		return pc;
2040 	return NULL;
2041 }
2042 
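/*
 * Synthesize the non-sample events that describe the system at record time:
 * time conversion data, the id index, auxtrace info, kernel and module maps,
 * thread and cpu maps, BPF and cgroup events and the existing tasks.
 */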
2043 static int record__synthesize(struct record *rec, bool tail)
2044 {
2045 	struct perf_session *session = rec->session;
2046 	struct machine *machine = &session->machines.host;
2047 	struct perf_data *data = &rec->data;
2048 	struct record_opts *opts = &rec->opts;
2049 	struct perf_tool *tool = &rec->tool;
2050 	int err = 0;
2051 	event_op f = process_synthesized_event;
2052 
2053 	if (rec->opts.tail_synthesize != tail)
2054 		return 0;
2055 
2056 	if (data->is_pipe) {
2057 		err = perf_event__synthesize_for_pipe(tool, session, data,
2058 						      process_synthesized_event);
2059 		if (err < 0)
2060 			goto out;
2061 
2062 		rec->bytes_written += err;
2063 	}
2064 
2065 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2066 					  process_synthesized_event, machine);
2067 	if (err)
2068 		goto out;
2069 
2070 	/* Synthesize id_index before auxtrace_info */
2071 	err = perf_event__synthesize_id_index(tool,
2072 					      process_synthesized_event,
2073 					      session->evlist, machine);
2074 	if (err)
2075 		goto out;
2076 
2077 	if (rec->opts.full_auxtrace) {
2078 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2079 					session, process_synthesized_event);
2080 		if (err)
2081 			goto out;
2082 	}
2083 
2084 	if (!evlist__exclude_kernel(rec->evlist)) {
2085 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2086 							 machine);
2087 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2088 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2089 				   "Check /proc/kallsyms permission or run as root.\n");
2090 
2091 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2092 						     machine);
2093 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2094 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2095 				   "Check /proc/modules permission or run as root.\n");
2096 	}
2097 
2098 	if (perf_guest) {
2099 		machines__process_guests(&session->machines,
2100 					 perf_event__synthesize_guest_os, tool);
2101 	}
2102 
2103 	err = perf_event__synthesize_extra_attr(&rec->tool,
2104 						rec->evlist,
2105 						process_synthesized_event,
2106 						data->is_pipe);
2107 	if (err)
2108 		goto out;
2109 
2110 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2111 						 process_synthesized_event,
2112 						NULL);
2113 	if (err < 0) {
2114 		pr_err("Couldn't synthesize thread map.\n");
2115 		return err;
2116 	}
2117 
2118 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2119 					     process_synthesized_event, NULL);
2120 	if (err < 0) {
2121 		pr_err("Couldn't synthesize cpu map.\n");
2122 		return err;
2123 	}
2124 
2125 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2126 						machine, opts);
2127 	if (err < 0) {
2128 		pr_warning("Couldn't synthesize bpf events.\n");
2129 		err = 0;
2130 	}
2131 
2132 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2133 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2134 						     machine);
2135 		if (err < 0) {
2136 			pr_warning("Couldn't synthesize cgroup events.\n");
2137 			err = 0;
2138 		}
2139 	}
2140 
2141 	if (rec->opts.nr_threads_synthesize > 1) {
2142 		mutex_init(&synth_lock);
2143 		perf_set_multithreaded();
2144 		f = process_locked_synthesized_event;
2145 	}
2146 
2147 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2148 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2149 
2150 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2151 						    rec->evlist->core.threads,
2152 						    f, needs_mmap, opts->sample_address,
2153 						    rec->opts.nr_threads_synthesize);
2154 	}
2155 
2156 	if (rec->opts.nr_threads_synthesize > 1) {
2157 		perf_set_singlethreaded();
2158 		mutex_destroy(&synth_lock);
2159 	}
2160 
2161 out:
2162 	return err;
2163 }
2164 
2165 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2166 {
2167 	struct record *rec = data;
2168 	pthread_kill(rec->thread_id, SIGUSR2);
2169 	return 0;
2170 }
2171 
2172 static int record__setup_sb_evlist(struct record *rec)
2173 {
2174 	struct record_opts *opts = &rec->opts;
2175 
2176 	if (rec->sb_evlist != NULL) {
2177 		/*
2178 		 * We get here if --switch-output-event populated the
2179 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2180 		 * to the main thread.
2181 		 */
2182 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2183 		rec->thread_id = pthread_self();
2184 	}
2185 #ifdef HAVE_LIBBPF_SUPPORT
2186 	if (!opts->no_bpf_event) {
2187 		if (rec->sb_evlist == NULL) {
2188 			rec->sb_evlist = evlist__new();
2189 
2190 			if (rec->sb_evlist == NULL) {
2191 				pr_err("Couldn't create side band evlist.\n");
2192 				return -1;
2193 			}
2194 		}
2195 
2196 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2197 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
2198 			return -1;
2199 		}
2200 	}
2201 #endif
2202 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2203 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2204 		opts->no_bpf_event = true;
2205 	}
2206 
2207 	return 0;
2208 }
2209 
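/*
 * Store paired gettimeofday() and clock_gettime(clockid) reference times in
 * the header environment so sample timestamps can later be related to wall
 * clock time.
 */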
2210 static int record__init_clock(struct record *rec)
2211 {
2212 	struct perf_session *session = rec->session;
2213 	struct timespec ref_clockid;
2214 	struct timeval ref_tod;
2215 	u64 ref;
2216 
2217 	if (!rec->opts.use_clockid)
2218 		return 0;
2219 
2220 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2221 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2222 
2223 	session->header.env.clock.clockid = rec->opts.clockid;
2224 
2225 	if (gettimeofday(&ref_tod, NULL) != 0) {
2226 		pr_err("gettimeofday failed, cannot set reference time.\n");
2227 		return -1;
2228 	}
2229 
2230 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2231 		pr_err("clock_gettime failed, cannot set reference time.\n");
2232 		return -1;
2233 	}
2234 
2235 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2236 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2237 
2238 	session->header.env.clock.tod_ns = ref;
2239 
2240 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2241 	      (u64) ref_clockid.tv_nsec;
2242 
2243 	session->header.env.clock.clockid_ns = ref;
2244 	return 0;
2245 }
2246 
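/* If the snapshot trigger is armed, hit it and start an AUX area snapshot. */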
2247 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2248 {
2249 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2250 		trigger_hit(&auxtrace_snapshot_trigger);
2251 		auxtrace_record__snapshot_started = 1;
2252 		if (auxtrace_record__snapshot_start(rec->itr))
2253 			trigger_error(&auxtrace_snapshot_trigger);
2254 	}
2255 }
2256 
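/*
 * Signal a worker thread to terminate by closing the write end of its message
 * pipe, then wait for its acknowledgement on the ack pipe.
 */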
2257 static int record__terminate_thread(struct record_thread *thread_data)
2258 {
2259 	int err;
2260 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2261 	pid_t tid = thread_data->tid;
2262 
2263 	close(thread_data->pipes.msg[1]);
2264 	thread_data->pipes.msg[1] = -1;
2265 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2266 	if (err > 0)
2267 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2268 	else
2269 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2270 			   thread->tid, tid);
2271 
2272 	return 0;
2273 }
2274 
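/*
 * Start the per-thread trace readers with all signals blocked so that only
 * the main thread handles them, set up CPU affinities, and wait for each
 * thread's READY acknowledgement.
 */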
2275 static int record__start_threads(struct record *rec)
2276 {
2277 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2278 	struct record_thread *thread_data = rec->thread_data;
2279 	sigset_t full, mask;
2280 	pthread_t handle;
2281 	pthread_attr_t attrs;
2282 
2283 	thread = &thread_data[0];
2284 
2285 	if (!record__threads_enabled(rec))
2286 		return 0;
2287 
2288 	sigfillset(&full);
2289 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2290 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2291 		return -1;
2292 	}
2293 
2294 	pthread_attr_init(&attrs);
2295 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2296 
2297 	for (t = 1; t < nr_threads; t++) {
2298 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2299 
2300 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2301 		pthread_attr_setaffinity_np(&attrs,
2302 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2303 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2304 #endif
2305 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2306 			for (tt = 1; tt < t; tt++)
2307 				record__terminate_thread(&thread_data[tt]);
2308 			pr_err("Failed to start threads: %s\n", strerror(errno));
2309 			ret = -1;
2310 			goto out_err;
2311 		}
2312 
2313 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2314 		if (err > 0)
2315 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2316 				  thread_msg_tags[msg]);
2317 		else
2318 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2319 				   thread->tid, rec->thread_data[t].tid);
2320 	}
2321 
2322 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2323 			(cpu_set_t *)thread->mask->affinity.bits);
2324 
2325 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2326 
2327 out_err:
2328 	pthread_attr_destroy(&attrs);
2329 
2330 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2331 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2332 		ret = -1;
2333 	}
2334 
2335 	return ret;
2336 }
2337 
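/*
 * Terminate the worker threads and fold their per-thread sample and byte
 * counters into the record/session totals.
 */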
2338 static int record__stop_threads(struct record *rec)
2339 {
2340 	int t;
2341 	struct record_thread *thread_data = rec->thread_data;
2342 
2343 	for (t = 1; t < rec->nr_threads; t++)
2344 		record__terminate_thread(&thread_data[t]);
2345 
2346 	for (t = 0; t < rec->nr_threads; t++) {
2347 		rec->samples += thread_data[t].samples;
2348 		if (!record__threads_enabled(rec))
2349 			continue;
2350 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2351 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2352 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2353 			 thread_data[t].samples, thread_data[t].waking);
2354 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2355 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2356 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2357 		else
2358 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2359 	}
2360 
2361 	return 0;
2362 }
2363 
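/* Sum the poll wakeup counts of all reader threads for status reporting. */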
2364 static unsigned long record__waking(struct record *rec)
2365 {
2366 	int t;
2367 	unsigned long waking = 0;
2368 	struct record_thread *thread_data = rec->thread_data;
2369 
2370 	for (t = 0; t < rec->nr_threads; t++)
2371 		waking += thread_data[t].waking;
2372 
2373 	return waking;
2374 }
2375 
2376 static int __cmd_record(struct record *rec, int argc, const char **argv)
2377 {
2378 	int err;
2379 	int status = 0;
2380 	const bool forks = argc > 0;
2381 	struct perf_tool *tool = &rec->tool;
2382 	struct record_opts *opts = &rec->opts;
2383 	struct perf_data *data = &rec->data;
2384 	struct perf_session *session;
2385 	bool disabled = false, draining = false;
2386 	int fd;
2387 	float ratio = 0;
2388 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2389 
2390 	atexit(record__sig_exit);
2391 	signal(SIGCHLD, sig_handler);
2392 	signal(SIGINT, sig_handler);
2393 	signal(SIGTERM, sig_handler);
2394 	signal(SIGSEGV, sigsegv_handler);
2395 
2396 	if (rec->opts.record_cgroup) {
2397 #ifndef HAVE_FILE_HANDLE
2398 		pr_err("cgroup tracking is not supported\n");
2399 		return -1;
2400 #endif
2401 	}
2402 
2403 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2404 		signal(SIGUSR2, snapshot_sig_handler);
2405 		if (rec->opts.auxtrace_snapshot_mode)
2406 			trigger_on(&auxtrace_snapshot_trigger);
2407 		if (rec->switch_output.enabled)
2408 			trigger_on(&switch_output_trigger);
2409 	} else {
2410 		signal(SIGUSR2, SIG_IGN);
2411 	}
2412 
2413 	perf_tool__init(tool, /*ordered_events=*/true);
2414 	tool->sample		= process_sample_event;
2415 	tool->fork		= perf_event__process_fork;
2416 	tool->exit		= perf_event__process_exit;
2417 	tool->comm		= perf_event__process_comm;
2418 	tool->namespaces	= perf_event__process_namespaces;
2419 	tool->mmap		= build_id__process_mmap;
2420 	tool->mmap2		= build_id__process_mmap2;
2421 	tool->itrace_start	= process_timestamp_boundary;
2422 	tool->aux		= process_timestamp_boundary;
2423 	tool->namespace_events	= rec->opts.record_namespaces;
2424 	tool->cgroup_events	= rec->opts.record_cgroup;
2425 	session = perf_session__new(data, tool);
2426 	if (IS_ERR(session)) {
2427 		pr_err("Perf session creation failed.\n");
2428 		return PTR_ERR(session);
2429 	}
2430 
2431 	if (record__threads_enabled(rec)) {
2432 		if (perf_data__is_pipe(&rec->data)) {
2433 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2434 			return -1;
2435 		}
2436 		if (rec->opts.full_auxtrace) {
2437 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2438 			return -1;
2439 		}
2440 	}
2441 
2442 	fd = perf_data__fd(data);
2443 	rec->session = session;
2444 
2445 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2446 		pr_err("Compression initialization failed.\n");
2447 		return -1;
2448 	}
2449 #ifdef HAVE_EVENTFD_SUPPORT
2450 	done_fd = eventfd(0, EFD_NONBLOCK);
2451 	if (done_fd < 0) {
2452 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2453 		status = -1;
2454 		goto out_delete_session;
2455 	}
2456 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2457 	if (err < 0) {
2458 		pr_err("Failed to add wakeup eventfd to poll list\n");
2459 		status = err;
2460 		goto out_delete_session;
2461 	}
2462 #endif // HAVE_EVENTFD_SUPPORT
2463 
2464 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2465 	session->header.env.comp_level = rec->opts.comp_level;
2466 
2467 	if (rec->opts.kcore &&
2468 	    !record__kcore_readable(&session->machines.host)) {
2469 		pr_err("ERROR: kcore is not readable.\n");
2470 		return -1;
2471 	}
2472 
2473 	if (record__init_clock(rec))
2474 		return -1;
2475 
2476 	record__init_features(rec);
2477 
2478 	if (forks) {
2479 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2480 					       workload_exec_failed_signal);
2481 		if (err < 0) {
2482 			pr_err("Couldn't run the workload!\n");
2483 			status = err;
2484 			goto out_delete_session;
2485 		}
2486 	}
2487 
2488 	/*
2489 	 * If we have just a single event and are sending data
2490 	 * through a pipe, we need to force the id allocation,
2491 	 * because we synthesize the event name through the pipe
2492 	 * and need the id for that.
2493 	 */
2494 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2495 		rec->opts.sample_id = true;
2496 
2497 	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
2498 		rec->timestamp_filename = false;
2499 		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
2500 	}
2501 
2502 	/*
2503 	 * Use the global stat_config, which is zeroed, meaning aggr_mode is AGGR_NONE
2504 	 * and hybrid_merge is false.
2505 	 */
2506 	evlist__uniquify_evsel_names(rec->evlist, &stat_config);
2507 
2508 	evlist__config(rec->evlist, opts, &callchain_param);
2509 
2510 	/* Debug message used by test scripts */
2511 	pr_debug3("perf record opening and mmapping events\n");
2512 	if (record__open(rec) != 0) {
2513 		err = -1;
2514 		goto out_free_threads;
2515 	}
2516 	/* Debug message used by test scripts */
2517 	pr_debug3("perf record done opening and mmapping events\n");
2518 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2519 
2520 	if (rec->opts.kcore) {
2521 		err = record__kcore_copy(&session->machines.host, data);
2522 		if (err) {
2523 			pr_err("ERROR: Failed to copy kcore\n");
2524 			goto out_free_threads;
2525 		}
2526 	}
2527 
2528 	/*
2529 	 * Normally perf_session__new would do this, but it doesn't have the
2530 	 * evlist.
2531 	 */
2532 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2533 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2534 		rec->tool.ordered_events = false;
2535 	}
2536 
2537 	if (evlist__nr_groups(rec->evlist) == 0)
2538 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2539 
2540 	if (data->is_pipe) {
2541 		err = perf_header__write_pipe(fd);
2542 		if (err < 0)
2543 			goto out_free_threads;
2544 	} else {
2545 		err = perf_session__write_header(session, rec->evlist, fd, false);
2546 		if (err < 0)
2547 			goto out_free_threads;
2548 	}
2549 
2550 	err = -1;
2551 	if (!rec->no_buildid
2552 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2553 		pr_err("Couldn't generate buildids. "
2554 		       "Use --no-buildid to profile anyway.\n");
2555 		goto out_free_threads;
2556 	}
2557 
2558 	if (!evlist__needs_bpf_sb_event(rec->evlist))
2559 		opts->no_bpf_event = true;
2560 
2561 	err = record__setup_sb_evlist(rec);
2562 	if (err)
2563 		goto out_free_threads;
2564 
2565 	err = record__synthesize(rec, false);
2566 	if (err < 0)
2567 		goto out_free_threads;
2568 
2569 	if (rec->realtime_prio) {
2570 		struct sched_param param;
2571 
2572 		param.sched_priority = rec->realtime_prio;
2573 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2574 			pr_err("Could not set realtime priority.\n");
2575 			err = -1;
2576 			goto out_free_threads;
2577 		}
2578 	}
2579 
2580 	if (record__start_threads(rec))
2581 		goto out_free_threads;
2582 
2583 	/*
2584 	 * When perf is starting the traced process, all the events
2585 	 * (apart from group members) have enable_on_exec=1 set,
2586 	 * so don't spoil it by prematurely enabling them.
2587 	 */
2588 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2589 		evlist__enable(rec->evlist);
2590 
2591 	/*
2592 	 * offcpu-time does not call execve, so enable_on_exec wouldn't work
2593 	 * when recording a workload; do it manually.
2594 	 */
2595 	if (rec->off_cpu)
2596 		evlist__enable_evsel(rec->evlist, (char *)OFFCPU_EVENT);
2597 
2598 	/*
2599 	 * Let the child rip
2600 	 */
2601 	if (forks) {
2602 		struct machine *machine = &session->machines.host;
2603 		union perf_event *event;
2604 		pid_t tgid;
2605 
2606 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2607 		if (event == NULL) {
2608 			err = -ENOMEM;
2609 			goto out_child;
2610 		}
2611 
2612 		/*
2613 		 * Some H/W events are generated before the COMM event,
2614 		 * which is emitted during exec(), so perf script
2615 		 * cannot see the correct process name for those events.
2616 		 * Synthesize a COMM event to prevent that.
2617 		 */
2618 		tgid = perf_event__synthesize_comm(tool, event,
2619 						   rec->evlist->workload.pid,
2620 						   process_synthesized_event,
2621 						   machine);
2622 		free(event);
2623 
2624 		if (tgid == -1)
2625 			goto out_child;
2626 
2627 		event = malloc(sizeof(event->namespaces) +
2628 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2629 			       machine->id_hdr_size);
2630 		if (event == NULL) {
2631 			err = -ENOMEM;
2632 			goto out_child;
2633 		}
2634 
2635 		/*
2636 		 * Synthesize NAMESPACES event for the command specified.
2637 		 */
2638 		perf_event__synthesize_namespaces(tool, event,
2639 						  rec->evlist->workload.pid,
2640 						  tgid, process_synthesized_event,
2641 						  machine);
2642 		free(event);
2643 
2644 		evlist__start_workload(rec->evlist);
2645 	}
2646 
2647 	if (opts->target.initial_delay) {
2648 		pr_info(EVLIST_DISABLED_MSG);
2649 		if (opts->target.initial_delay > 0) {
2650 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2651 			evlist__enable(rec->evlist);
2652 			pr_info(EVLIST_ENABLED_MSG);
2653 		}
2654 	}
2655 
2656 	err = event_enable_timer__start(rec->evlist->eet);
2657 	if (err)
2658 		goto out_child;
2659 
2660 	/* Debug message used by test scripts */
2661 	pr_debug3("perf record has started\n");
2662 	fflush(stderr);
2663 
2664 	trigger_ready(&auxtrace_snapshot_trigger);
2665 	trigger_ready(&switch_output_trigger);
2666 	perf_hooks__invoke_record_start();
2667 
2668 	/*
2669 	 * Must write FINISHED_INIT so it will be seen after all other
2670 	 * synthesized user events, but before any regular events.
2671 	 */
2672 	err = write_finished_init(rec, false);
2673 	if (err < 0)
2674 		goto out_child;
2675 
2676 	for (;;) {
2677 		unsigned long long hits = thread->samples;
2678 
2679 		/*
2680 		 * rec->evlist->bkw_mmap_state may be
2681 		 * BKW_MMAP_EMPTY here: when done == true and
2682 		 * hits != rec->samples in the previous round.
2683 		 *
2684 		 * evlist__toggle_bkw_mmap ensures we never
2685 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2686 		 */
2687 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2688 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2689 
2690 		if (record__mmap_read_all(rec, false) < 0) {
2691 			trigger_error(&auxtrace_snapshot_trigger);
2692 			trigger_error(&switch_output_trigger);
2693 			err = -1;
2694 			goto out_child;
2695 		}
2696 
2697 		if (auxtrace_record__snapshot_started) {
2698 			auxtrace_record__snapshot_started = 0;
2699 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2700 				record__read_auxtrace_snapshot(rec, false);
2701 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2702 				pr_err("AUX area tracing snapshot failed\n");
2703 				err = -1;
2704 				goto out_child;
2705 			}
2706 		}
2707 
2708 		if (trigger_is_hit(&switch_output_trigger)) {
2709 			/*
2710 			 * If switch_output_trigger is hit, the data in the
2711 			 * overwritable ring buffer should have been collected,
2712 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2713 			 *
2714 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
2715 			 * record__mmap_read_all() didn't collect data from the
2716 			 * overwritable ring buffer. Read again.
2717 			 */
2718 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2719 				continue;
2720 			trigger_ready(&switch_output_trigger);
2721 
2722 			/*
2723 			 * Reenable events in overwrite ring buffer after
2724 			 * record__mmap_read_all(): we should have collected
2725 			 * data from it.
2726 			 */
2727 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2728 
2729 			if (!quiet)
2730 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2731 					record__waking(rec));
2732 			thread->waking = 0;
2733 			fd = record__switch_output(rec, false);
2734 			if (fd < 0) {
2735 				pr_err("Failed to switch to new file\n");
2736 				trigger_error(&switch_output_trigger);
2737 				err = fd;
2738 				goto out_child;
2739 			}
2740 
2741 			/* re-arm the alarm */
2742 			if (rec->switch_output.time)
2743 				alarm(rec->switch_output.time);
2744 		}
2745 
2746 		if (hits == thread->samples) {
2747 			if (done || draining)
2748 				break;
2749 			err = fdarray__poll(&thread->pollfd, -1);
2750 			/*
2751 			 * Propagate an error only if there is one. Ignore a positive
2752 			 * number of returned events and an interrupted poll (EINTR).
2753 			 */
2754 			if (err > 0 || (err < 0 && errno == EINTR))
2755 				err = 0;
2756 			thread->waking++;
2757 
2758 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2759 					    record__thread_munmap_filtered, NULL) == 0)
2760 				draining = true;
2761 
2762 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2763 			if (err)
2764 				goto out_child;
2765 		}
2766 
2767 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2768 			switch (cmd) {
2769 			case EVLIST_CTL_CMD_SNAPSHOT:
2770 				hit_auxtrace_snapshot_trigger(rec);
2771 				evlist__ctlfd_ack(rec->evlist);
2772 				break;
2773 			case EVLIST_CTL_CMD_STOP:
2774 				done = 1;
2775 				break;
2776 			case EVLIST_CTL_CMD_ACK:
2777 			case EVLIST_CTL_CMD_UNSUPPORTED:
2778 			case EVLIST_CTL_CMD_ENABLE:
2779 			case EVLIST_CTL_CMD_DISABLE:
2780 			case EVLIST_CTL_CMD_EVLIST:
2781 			case EVLIST_CTL_CMD_PING:
2782 			default:
2783 				break;
2784 			}
2785 		}
2786 
2787 		err = event_enable_timer__process(rec->evlist->eet);
2788 		if (err < 0)
2789 			goto out_child;
2790 		if (err) {
2791 			err = 0;
2792 			done = 1;
2793 		}
2794 
2795 		/*
2796 		 * When perf is starting the traced process, the events die with
2797 		 * the process at the end and we wait for that. Thus there is no
2798 		 * need to disable events in this case.
2799 		 */
2800 		if (done && !disabled && !target__none(&opts->target)) {
2801 			trigger_off(&auxtrace_snapshot_trigger);
2802 			evlist__disable(rec->evlist);
2803 			disabled = true;
2804 		}
2805 	}
2806 
2807 	trigger_off(&auxtrace_snapshot_trigger);
2808 	trigger_off(&switch_output_trigger);
2809 
2810 	if (opts->auxtrace_snapshot_on_exit)
2811 		record__auxtrace_snapshot_exit(rec);
2812 
2813 	if (forks && workload_exec_errno) {
2814 		char msg[STRERR_BUFSIZE];
2815 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2816 		struct strbuf sb = STRBUF_INIT;
2817 
2818 		evlist__format_evsels(rec->evlist, &sb, 2048);
2819 
2820 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2821 			sb.buf, argv[0], emsg);
2822 		strbuf_release(&sb);
2823 		err = -1;
2824 		goto out_child;
2825 	}
2826 
2827 	if (!quiet)
2828 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2829 			record__waking(rec));
2830 
2831 	write_finished_init(rec, true);
2832 
2833 	if (target__none(&rec->opts.target))
2834 		record__synthesize_workload(rec, true);
2835 
2836 out_child:
2837 	record__stop_threads(rec);
2838 	record__mmap_read_all(rec, true);
2839 out_free_threads:
2840 	record__free_thread_data(rec);
2841 	evlist__finalize_ctlfd(rec->evlist);
2842 	record__aio_mmap_read_sync(rec);
2843 
2844 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2845 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2846 		session->header.env.comp_ratio = ratio + 0.5;
2847 	}
2848 
2849 	if (forks) {
2850 		int exit_status;
2851 
2852 		if (!child_finished)
2853 			kill(rec->evlist->workload.pid, SIGTERM);
2854 
2855 		wait(&exit_status);
2856 
2857 		if (err < 0)
2858 			status = err;
2859 		else if (WIFEXITED(exit_status))
2860 			status = WEXITSTATUS(exit_status);
2861 		else if (WIFSIGNALED(exit_status))
2862 			signr = WTERMSIG(exit_status);
2863 	} else
2864 		status = err;
2865 
2866 	if (rec->off_cpu)
2867 		rec->bytes_written += off_cpu_write(rec->session);
2868 
2869 	record__read_lost_samples(rec);
2870 	record__synthesize(rec, true);
2871 	/* this will be recalculated during process_buildids() */
2872 	rec->samples = 0;
2873 
2874 	if (!err) {
2875 		if (!rec->timestamp_filename) {
2876 			record__finish_output(rec);
2877 		} else {
2878 			fd = record__switch_output(rec, true);
2879 			if (fd < 0) {
2880 				status = fd;
2881 				goto out_delete_session;
2882 			}
2883 		}
2884 	}
2885 
2886 	perf_hooks__invoke_record_end();
2887 
2888 	if (!err && !quiet) {
2889 		char samples[128];
2890 		const char *postfix = rec->timestamp_filename ?
2891 					".<timestamp>" : "";
2892 
2893 		if (rec->samples && !rec->opts.full_auxtrace)
2894 			scnprintf(samples, sizeof(samples),
2895 				  " (%" PRIu64 " samples)", rec->samples);
2896 		else
2897 			samples[0] = '\0';
2898 
2899 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2900 			perf_data__size(data) / 1024.0 / 1024.0,
2901 			data->path, postfix, samples);
2902 		if (ratio) {
2903 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2904 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2905 					ratio);
2906 		}
2907 		fprintf(stderr, " ]\n");
2908 	}
2909 
2910 out_delete_session:
2911 #ifdef HAVE_EVENTFD_SUPPORT
2912 	if (done_fd >= 0) {
2913 		fd = done_fd;
2914 		done_fd = -1;
2915 
2916 		close(fd);
2917 	}
2918 #endif
2919 	zstd_fini(&session->zstd_data);
2920 	if (!opts->no_bpf_event)
2921 		evlist__stop_sb_thread(rec->sb_evlist);
2922 
2923 	perf_session__delete(session);
2924 	return status;
2925 }
2926 
2927 static void callchain_debug(struct callchain_param *callchain)
2928 {
2929 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2930 
2931 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2932 
2933 	if (callchain->record_mode == CALLCHAIN_DWARF)
2934 		pr_debug("callchain: stack dump size %d\n",
2935 			 callchain->dump_size);
2936 }
2937 
2938 int record_opts__parse_callchain(struct record_opts *record,
2939 				 struct callchain_param *callchain,
2940 				 const char *arg, bool unset)
2941 {
2942 	int ret;
2943 	callchain->enabled = !unset;
2944 
2945 	/* --no-call-graph */
2946 	if (unset) {
2947 		callchain->record_mode = CALLCHAIN_NONE;
2948 		pr_debug("callchain: disabled\n");
2949 		return 0;
2950 	}
2951 
2952 	ret = parse_callchain_record_opt(arg, callchain);
2953 	if (!ret) {
2954 		/* Enable data address sampling for DWARF unwind. */
2955 		if (callchain->record_mode == CALLCHAIN_DWARF)
2956 			record->sample_address = true;
2957 		callchain_debug(callchain);
2958 	}
2959 
2960 	return ret;
2961 }
2962 
2963 int record_parse_callchain_opt(const struct option *opt,
2964 			       const char *arg,
2965 			       int unset)
2966 {
2967 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2968 }
2969 
2970 int record_callchain_opt(const struct option *opt,
2971 			 const char *arg __maybe_unused,
2972 			 int unset __maybe_unused)
2973 {
2974 	struct callchain_param *callchain = opt->value;
2975 
2976 	callchain->enabled = true;
2977 
2978 	if (callchain->record_mode == CALLCHAIN_NONE)
2979 		callchain->record_mode = CALLCHAIN_FP;
2980 
2981 	callchain_debug(callchain);
2982 	return 0;
2983 }
2984 
2985 static int perf_record_config(const char *var, const char *value, void *cb)
2986 {
2987 	struct record *rec = cb;
2988 
2989 	if (!strcmp(var, "record.build-id")) {
2990 		if (!strcmp(value, "cache"))
2991 			rec->no_buildid_cache = false;
2992 		else if (!strcmp(value, "no-cache"))
2993 			rec->no_buildid_cache = true;
2994 		else if (!strcmp(value, "skip"))
2995 			rec->no_buildid = true;
2996 		else if (!strcmp(value, "mmap"))
2997 			rec->buildid_mmap = true;
2998 		else
2999 			return -1;
3000 		return 0;
3001 	}
3002 	if (!strcmp(var, "record.call-graph")) {
3003 		var = "call-graph.record-mode";
3004 		return perf_default_config(var, value, cb);
3005 	}
3006 #ifdef HAVE_AIO_SUPPORT
3007 	if (!strcmp(var, "record.aio")) {
3008 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
3009 		if (!rec->opts.nr_cblocks)
3010 			rec->opts.nr_cblocks = nr_cblocks_default;
3011 	}
3012 #endif
3013 	if (!strcmp(var, "record.debuginfod")) {
3014 		rec->debuginfod.urls = strdup(value);
3015 		if (!rec->debuginfod.urls)
3016 			return -ENOMEM;
3017 		rec->debuginfod.set = true;
3018 	}
3019 
3020 	return 0;
3021 }
3022 
3023 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
3024 {
3025 	struct record *rec = (struct record *)opt->value;
3026 
3027 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
3028 }
3029 
3030 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
3031 {
3032 	struct record_opts *opts = (struct record_opts *)opt->value;
3033 
3034 	if (unset || !str)
3035 		return 0;
3036 
3037 	if (!strcasecmp(str, "node"))
3038 		opts->affinity = PERF_AFFINITY_NODE;
3039 	else if (!strcasecmp(str, "cpu"))
3040 		opts->affinity = PERF_AFFINITY_CPU;
3041 
3042 	return 0;
3043 }
3044 
3045 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3046 {
3047 	mask->nbits = nr_bits;
3048 	mask->bits = bitmap_zalloc(mask->nbits);
3049 	if (!mask->bits)
3050 		return -ENOMEM;
3051 
3052 	return 0;
3053 }
3054 
3055 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3056 {
3057 	bitmap_free(mask->bits);
3058 	mask->nbits = 0;
3059 }
3060 
3061 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3062 {
3063 	int ret;
3064 
3065 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3066 	if (ret) {
3067 		mask->affinity.bits = NULL;
3068 		return ret;
3069 	}
3070 
3071 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3072 	if (ret) {
3073 		record__mmap_cpu_mask_free(&mask->maps);
3074 		mask->maps.bits = NULL;
3075 	}
3076 
3077 	return ret;
3078 }
3079 
3080 static void record__thread_mask_free(struct thread_mask *mask)
3081 {
3082 	record__mmap_cpu_mask_free(&mask->maps);
3083 	record__mmap_cpu_mask_free(&mask->affinity);
3084 }
3085 
3086 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3087 {
3088 	int s;
3089 	struct record_opts *opts = opt->value;
3090 
3091 	if (unset || !str || !strlen(str)) {
3092 		opts->threads_spec = THREAD_SPEC__CPU;
3093 	} else {
3094 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3095 			if (s == THREAD_SPEC__USER) {
3096 				opts->threads_user_spec = strdup(str);
3097 				if (!opts->threads_user_spec)
3098 					return -ENOMEM;
3099 				opts->threads_spec = THREAD_SPEC__USER;
3100 				break;
3101 			}
3102 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3103 				opts->threads_spec = s;
3104 				break;
3105 			}
3106 		}
3107 	}
3108 
3109 	if (opts->threads_spec == THREAD_SPEC__USER)
3110 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3111 	else
3112 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3113 
3114 	return 0;
3115 }
3116 
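/* Parse the --max-size limit, accepting B/K/M/G suffixes, into bytes. */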
3117 static int parse_output_max_size(const struct option *opt,
3118 				 const char *str, int unset)
3119 {
3120 	unsigned long *s = (unsigned long *)opt->value;
3121 	static struct parse_tag tags_size[] = {
3122 		{ .tag  = 'B', .mult = 1       },
3123 		{ .tag  = 'K', .mult = 1 << 10 },
3124 		{ .tag  = 'M', .mult = 1 << 20 },
3125 		{ .tag  = 'G', .mult = 1 << 30 },
3126 		{ .tag  = 0 },
3127 	};
3128 	unsigned long val;
3129 
3130 	if (unset) {
3131 		*s = 0;
3132 		return 0;
3133 	}
3134 
3135 	val = parse_tag_value(str, tags_size);
3136 	if (val != (unsigned long) -1) {
3137 		*s = val;
3138 		return 0;
3139 	}
3140 
3141 	return -1;
3142 }
3143 
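/*
 * Parse "pages[,pages]": the first value sets the data mmap size, the
 * optional second one the AUX area tracing mmap size.
 */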
3144 static int record__parse_mmap_pages(const struct option *opt,
3145 				    const char *str,
3146 				    int unset __maybe_unused)
3147 {
3148 	struct record_opts *opts = opt->value;
3149 	char *s, *p;
3150 	unsigned int mmap_pages;
3151 	int ret;
3152 
3153 	if (!str)
3154 		return -EINVAL;
3155 
3156 	s = strdup(str);
3157 	if (!s)
3158 		return -ENOMEM;
3159 
3160 	p = strchr(s, ',');
3161 	if (p)
3162 		*p = '\0';
3163 
3164 	if (*s) {
3165 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3166 		if (ret)
3167 			goto out_free;
3168 		opts->mmap_pages = mmap_pages;
3169 	}
3170 
3171 	if (!p) {
3172 		ret = 0;
3173 		goto out_free;
3174 	}
3175 
3176 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3177 	if (ret)
3178 		goto out_free;
3179 
3180 	opts->auxtrace_mmap_pages = mmap_pages;
3181 
3182 out_free:
3183 	free(s);
3184 	return ret;
3185 }
3186 
3187 static int record__parse_off_cpu_thresh(const struct option *opt,
3188 					const char *str,
3189 					int unset __maybe_unused)
3190 {
3191 	struct record_opts *opts = opt->value;
3192 	char *endptr;
3193 	u64 off_cpu_thresh_ms;
3194 
3195 	if (!str)
3196 		return -EINVAL;
3197 
3198 	off_cpu_thresh_ms = strtoull(str, &endptr, 10);
3199 
3200 	/* strtoull() returned 0 but the string isn't "0": parsing failed */
3201 	if (*endptr || (off_cpu_thresh_ms == 0 && strcmp(str, "0")))
3202 		return -EINVAL;
3203 	else
3204 		opts->off_cpu_thresh_ns = off_cpu_thresh_ms * NSEC_PER_MSEC;
3205 
3206 	return 0;
3207 }
3208 
3209 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3210 {
3211 }
3212 
3213 static int parse_control_option(const struct option *opt,
3214 				const char *str,
3215 				int unset __maybe_unused)
3216 {
3217 	struct record_opts *opts = opt->value;
3218 
3219 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3220 }
3221 
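/*
 * Warn when the --switch-output size threshold is smaller than half of the
 * mmap wakeup size, since output files can then exceed the requested size.
 */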
3222 static void switch_output_size_warn(struct record *rec)
3223 {
3224 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3225 	struct switch_output *s = &rec->switch_output;
3226 
3227 	wakeup_size /= 2;
3228 
3229 	if (s->size < wakeup_size) {
3230 		char buf[100];
3231 
3232 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3233 		pr_warning("WARNING: switch-output data size lower than "
3234 			   "wakeup kernel buffer size (%s), "
3235 			   "expect bigger perf.data sizes\n", buf);
3236 	}
3237 }
3238 
3239 static int switch_output_setup(struct record *rec)
3240 {
3241 	struct switch_output *s = &rec->switch_output;
3242 	static struct parse_tag tags_size[] = {
3243 		{ .tag  = 'B', .mult = 1       },
3244 		{ .tag  = 'K', .mult = 1 << 10 },
3245 		{ .tag  = 'M', .mult = 1 << 20 },
3246 		{ .tag  = 'G', .mult = 1 << 30 },
3247 		{ .tag  = 0 },
3248 	};
3249 	static struct parse_tag tags_time[] = {
3250 		{ .tag  = 's', .mult = 1        },
3251 		{ .tag  = 'm', .mult = 60       },
3252 		{ .tag  = 'h', .mult = 60*60    },
3253 		{ .tag  = 'd', .mult = 60*60*24 },
3254 		{ .tag  = 0 },
3255 	};
3256 	unsigned long val;
3257 
3258 	/*
3259 	 * If we're using --switch-output-event, then we imply
3260 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3261 	 * thread to its parent.
3262 	 */
3263 	if (rec->switch_output_event_set) {
3264 		if (record__threads_enabled(rec)) {
3265 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3266 			return 0;
3267 		}
3268 		goto do_signal;
3269 	}
3270 
3271 	if (!s->set)
3272 		return 0;
3273 
3274 	if (record__threads_enabled(rec)) {
3275 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3276 		return 0;
3277 	}
3278 
3279 	if (!strcmp(s->str, "signal")) {
3280 do_signal:
3281 		s->signal = true;
3282 		pr_debug("switch-output with SIGUSR2 signal\n");
3283 		goto enabled;
3284 	}
3285 
3286 	val = parse_tag_value(s->str, tags_size);
3287 	if (val != (unsigned long) -1) {
3288 		s->size = val;
3289 		pr_debug("switch-output with %s size threshold\n", s->str);
3290 		goto enabled;
3291 	}
3292 
3293 	val = parse_tag_value(s->str, tags_time);
3294 	if (val != (unsigned long) -1) {
3295 		s->time = val;
3296 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3297 			 s->str, s->time);
3298 		goto enabled;
3299 	}
3300 
3301 	return -1;
3302 
3303 enabled:
3304 	rec->timestamp_filename = true;
3305 	s->enabled              = true;
3306 
3307 	if (s->size && !rec->opts.no_buffering)
3308 		switch_output_size_warn(rec);
3309 
3310 	return 0;
3311 }
3312 
3313 static const char * const __record_usage[] = {
3314 	"perf record [<options>] [<command>]",
3315 	"perf record [<options>] -- <command> [<options>]",
3316 	NULL
3317 };
3318 const char * const *record_usage = __record_usage;
3319 
3320 static int build_id__process_mmap(const struct perf_tool *tool, union perf_event *event,
3321 				  struct perf_sample *sample, struct machine *machine)
3322 {
3323 	/*
3324 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3325 	 * so there is no need to add them twice.
3326 	 */
3327 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3328 		return 0;
3329 	return perf_event__process_mmap(tool, event, sample, machine);
3330 }
3331 
3332 static int build_id__process_mmap2(const struct perf_tool *tool, union perf_event *event,
3333 				   struct perf_sample *sample, struct machine *machine)
3334 {
3335 	/*
3336 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
3337 	 * so there is no need to add them twice.
3338 	 */
3339 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3340 		return 0;
3341 
3342 	return perf_event__process_mmap2(tool, event, sample, machine);
3343 }
3344 
3345 static int process_timestamp_boundary(const struct perf_tool *tool,
3346 				      union perf_event *event __maybe_unused,
3347 				      struct perf_sample *sample,
3348 				      struct machine *machine __maybe_unused)
3349 {
3350 	struct record *rec = container_of(tool, struct record, tool);
3351 
3352 	set_timestamp_boundary(rec, sample->time);
3353 	return 0;
3354 }
3355 
3356 static int parse_record_synth_option(const struct option *opt,
3357 				     const char *str,
3358 				     int unset __maybe_unused)
3359 {
3360 	struct record_opts *opts = opt->value;
3361 	char *p = strdup(str);
3362 
3363 	if (p == NULL)
3364 		return -1;
3365 
3366 	opts->synth = parse_synth_opt(p);
3367 	free(p);
3368 
3369 	if (opts->synth < 0) {
3370 		pr_err("Invalid synth option: %s\n", str);
3371 		return -1;
3372 	}
3373 	return 0;
3374 }
3375 
3376 /*
3377  * XXX Ideally would be local to cmd_record() and passed to a record__new
3378  * because we need to have access to it in record__exit, that is called
3379  * after cmd_record() exits, but since record_options need to be accessible to
3380  * builtin-script, leave it here.
3381  *
3382  * At least we don't touch it in all the other functions here directly.
3383  *
3384  * Just say no to tons of global variables, sigh.
3385  */
3386 static struct record record = {
3387 	.opts = {
3388 		.sample_time	     = true,
3389 		.mmap_pages	     = UINT_MAX,
3390 		.user_freq	     = UINT_MAX,
3391 		.user_interval	     = ULLONG_MAX,
3392 		.freq		     = 4000,
3393 		.target		     = {
3394 			.uses_mmap   = true,
3395 			.default_per_cpu = true,
3396 		},
3397 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3398 		.nr_threads_synthesize = 1,
3399 		.ctl_fd              = -1,
3400 		.ctl_fd_ack          = -1,
3401 		.synth               = PERF_SYNTH_ALL,
3402 		.off_cpu_thresh_ns   = OFFCPU_THRESH,
3403 	},
3404 };
3405 
3406 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3407 	"\n\t\t\t\tDefault: fp";
3408 
3409 static bool dry_run;
3410 
3411 static struct parse_events_option_args parse_events_option_args = {
3412 	.evlistp = &record.evlist,
3413 };
3414 
3415 static struct parse_events_option_args switch_output_parse_events_option_args = {
3416 	.evlistp = &record.sb_evlist,
3417 };
3418 
3419 /*
3420  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3421  * with it and switch to use the library functions in perf_evlist that came
3422  * from builtin-record.c, i.e. use record_opts,
3423  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3424  * using pipes, etc.
3425  */
3426 static struct option __record_options[] = {
3427 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3428 		     "event selector. use 'perf list' to list available events",
3429 		     parse_events_option),
3430 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3431 		     "event filter", parse_filter),
3432 	OPT_BOOLEAN(0, "latency", &record.latency,
3433 		    "Enable data collection for latency profiling.\n"
3434 		    "\t\t\t  Use perf report --latency for latency-centric profile."),
3435 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3436 			   NULL, "don't record events from perf itself",
3437 			   exclude_perf),
3438 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3439 		    "record events on existing process id"),
3440 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3441 		    "record events on existing thread id"),
3442 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3443 		    "collect data with this RT SCHED_FIFO priority"),
3444 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3445 		    "collect data without buffering"),
3446 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3447 		    "collect raw sample records from all opened counters"),
3448 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3449 			    "system-wide collection from all CPUs"),
3450 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3451 		    "list of cpus to monitor"),
3452 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3453 	OPT_STRING('o', "output", &record.data.path, "file",
3454 		    "output file name"),
3455 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3456 			&record.opts.no_inherit_set,
3457 			"child tasks do not inherit counters"),
3458 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3459 		    "synthesize non-sample events at the end of output"),
3460 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3461 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3462 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3463 		    "Fail if the specified frequency can't be used"),
3464 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3465 		     "profile at this frequency",
3466 		      record__parse_freq),
3467 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3468 		     "number of mmap data pages and AUX area tracing mmap pages",
3469 		     record__parse_mmap_pages),
3470 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3471 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3472 		     record__mmap_flush_parse),
3473 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3474 			   NULL, "enables call-graph recording" ,
3475 			   &record_callchain_opt),
3476 	OPT_CALLBACK(0, "call-graph", &record.opts,
3477 		     "record_mode[,record_size]", record_callchain_help,
3478 		     &record_parse_callchain_opt),
3479 	OPT_INCR('v', "verbose", &verbose,
3480 		    "be more verbose (show counter open errors, etc)"),
3481 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3482 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3483 		    "per thread counts"),
3484 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3485 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3486 		    "Record the sample physical addresses"),
3487 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3488 		    "Record the sampled data address data page size"),
3489 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3490 		    "Record the sampled code address (ip) page size"),
3491 	OPT_BOOLEAN(0, "sample-mem-info", &record.opts.sample_data_src,
3492 		    "Record the data source for memory operations"),
3493 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3494 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3495 		    "Record the sample identifier"),
3496 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3497 			&record.opts.sample_time_set,
3498 			"Record the sample timestamps"),
3499 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3500 			"Record the sample period"),
3501 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3502 		    "don't sample"),
3503 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3504 			&record.no_buildid_cache_set,
3505 			"do not update the buildid cache"),
3506 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3507 			&record.no_buildid_set,
3508 			"do not collect buildids in perf.data"),
3509 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3510 		     "monitor event in cgroup name only",
3511 		     parse_cgroups),
3512 	OPT_CALLBACK('D', "delay", &record, "ms",
3513 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3514 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3515 		     record__parse_event_enable_time),
3516 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3517 	OPT_STRING('u', "uid", &record.uid_str, "user", "user to profile"),
3518 
3519 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3520 		     "branch any", "sample any taken branches",
3521 		     parse_branch_stack),
3522 
3523 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3524 		     "branch filter mask", "branch stack filter modes",
3525 		     parse_branch_stack),
3526 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3527 		    "sample by weight (on special events only)"),
3528 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3529 		    "sample transaction flags (special events only)"),
3530 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3531 		    "use per-thread mmaps"),
3532 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3533 		    "sample selected machine registers on interrupt,"
3534 		    " use '-I?' to list register names", parse_intr_regs),
3535 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3536 		    "sample selected machine registers in user space,"
3537 		    " use '--user-regs=?' to list register names", parse_user_regs),
3538 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3539 		    "Record running/enabled time of read (:S) events"),
3540 	OPT_CALLBACK('k', "clockid", &record.opts,
3541 	"clockid", "clockid to use for events, see clock_gettime()",
3542 	parse_clockid),
3543 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3544 			  "opts", "AUX area tracing Snapshot Mode", ""),
3545 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3546 			  "opts", "sample AUX area", ""),
3547 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3548 			"per thread proc mmap processing timeout in ms"),
3549 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3550 		    "Record namespaces events"),
3551 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3552 		    "Record cgroup events"),
3553 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3554 			&record.opts.record_switch_events_set,
3555 			"Record context switch events"),
3556 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3557 			 "Configure all used events to run in kernel space.",
3558 			 PARSE_OPT_EXCLUSIVE),
3559 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3560 			 "Configure all used events to run in user space.",
3561 			 PARSE_OPT_EXCLUSIVE),
3562 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3563 		    "collect kernel callchains"),
3564 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3565 		    "collect user callchains"),
3566 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3567 		   "file", "vmlinux pathname"),
3568 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3569 		    "Record build-id of all DSOs regardless of hits"),
3570 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3571 		    "Record build-id in map events"),
3572 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3573 		    "append timestamp to output filename"),
3574 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3575 		    "Record timestamp boundary (time of first/last samples)"),
3576 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3577 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3578 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3579 			  "signal"),
3580 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3581 			 &record.switch_output_event_set, "switch output event",
3582 			 "switch output event selector. use 'perf list' to list available events",
3583 			 parse_events_option_new_evlist),
3584 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3585 		   "Limit number of switch output generated files"),
3586 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3587 		    "Parse options then exit"),
3588 #ifdef HAVE_AIO_SUPPORT
3589 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3590 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3591 		     record__aio_parse),
3592 #endif
3593 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3594 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3595 		     record__parse_affinity),
3596 #ifdef HAVE_ZSTD_SUPPORT
3597 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3598 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3599 			    record__parse_comp_level),
3600 #endif
3601 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3602 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3603 	OPT_UINTEGER(0, "num-thread-synthesize",
3604 		     &record.opts.nr_threads_synthesize,
3605 		     "number of threads to run for event synthesis"),
3606 #ifdef HAVE_LIBPFM
3607 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3608 		"libpfm4 event selector. use 'perf list' to list available events",
3609 		parse_libpfm_events_option),
3610 #endif
3611 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3612 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3613 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3614 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3615 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3616 		      parse_control_option),
3617 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3618 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3619 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3620 			  &record.debuginfod.set, "debuginfod urls",
3621 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3622 			  "system"),
3623 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3624 			    "write collected trace data into several data files using parallel threads",
3625 			    record__parse_threads),
3626 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3627 	OPT_STRING(0, "setup-filter", &record.filter_action, "pin|unpin",
3628 		   "BPF filter action"),
3629 	OPT_CALLBACK(0, "off-cpu-thresh", &record.opts, "ms",
3630 		     "Dump off-cpu samples if off-cpu time exceeds this threshold (in milliseconds). (Default: 500ms)",
3631 		     record__parse_off_cpu_thresh),
3632 	OPT_END()
3633 };
3634 
3635 struct option *record_options = __record_options;
3636 
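/*
 * Set a bit in @mask for every CPU in @cpus. A dummy CPU map (e.g. in
 * per-thread mode) leaves the mask untouched, and -ENODEV is returned
 * when a CPU number does not fit into the mask.
 */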
3637 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3638 {
3639 	struct perf_cpu cpu;
3640 	int idx;
3641 
3642 	if (cpu_map__is_dummy(cpus))
3643 		return 0;
3644 
3645 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3646 		/* Return -ENODEV if the input CPU is greater than the max CPU */
3647 		if ((unsigned long)cpu.cpu > mask->nbits)
3648 			return -ENODEV;
3649 		__set_bit(cpu.cpu, mask->bits);
3650 	}
3651 
3652 	return 0;
3653 }
3654 
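/*
 * Build a CPU mask from a textual CPU list such as "0-3,7" (the format
 * accepted by perf_cpu_map__new()), as used by the --threads spec
 * handling below.
 */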
3655 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3656 {
3657 	struct perf_cpu_map *cpus;
3658 
3659 	cpus = perf_cpu_map__new(mask_spec);
3660 	if (!cpus)
3661 		return -ENOMEM;
3662 
3663 	bitmap_zero(mask->bits, mask->nbits);
3664 	if (record__mmap_cpu_mask_init(mask, cpus)) {
		/* Do not leak the CPU map on the error path. */
		perf_cpu_map__put(cpus);
3665 		return -ENODEV;
	}
3666 
3667 	perf_cpu_map__put(cpus);
3668 
3669 	return 0;
3670 }
3671 
3672 static void record__free_thread_masks(struct record *rec, int nr_threads)
3673 {
3674 	int t;
3675 
3676 	if (rec->thread_masks)
3677 		for (t = 0; t < nr_threads; t++)
3678 			record__thread_mask_free(&rec->thread_masks[t]);
3679 
3680 	zfree(&rec->thread_masks);
3681 }
3682 
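/*
 * Allocate @nr_threads thread masks, each with @nr_bits wide 'maps' and
 * 'affinity' bitmaps. On failure everything allocated so far is freed
 * again.
 */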
3683 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3684 {
3685 	int t, ret;
3686 
3687 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3688 	if (!rec->thread_masks) {
3689 		pr_err("Failed to allocate thread masks\n");
3690 		return -ENOMEM;
3691 	}
3692 
3693 	for (t = 0; t < nr_threads; t++) {
3694 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3695 		if (ret) {
3696 			pr_err("Failed to allocate thread masks[%d]\n", t);
3697 			goto out_free;
3698 		}
3699 	}
3700 
3701 	return 0;
3702 
3703 out_free:
3704 	record__free_thread_masks(rec, nr_threads);
3705 
3706 	return ret;
3707 }
3708 
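/*
 * --threads=cpu: one data streaming thread per monitored CPU, e.g.:
 *
 *   perf record --threads=cpu -a -- sleep 1
 *
 * Both the maps and the affinity mask of thread t contain only the t-th
 * CPU of the evlist CPU map.
 */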
3709 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3710 {
3711 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3712 
3713 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3714 	if (ret)
3715 		return ret;
3716 
3717 	rec->nr_threads = nr_cpus;
3718 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3719 
3720 	for (t = 0; t < rec->nr_threads; t++) {
3721 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3722 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3723 		if (verbose > 0) {
3724 			pr_debug("thread_masks[%d]: ", t);
3725 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3726 			pr_debug("thread_masks[%d]: ", t);
3727 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3728 		}
3729 	}
3730 
3731 	return 0;
3732 }
3733 
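/*
 * Build one thread mask per maps/affinity spec pair. CPUs outside the
 * evlist CPU map are silently dropped, while a spec that ends up empty
 * or overlaps a previously accepted one is rejected with -EINVAL.
 */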
3734 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3735 					  const char **maps_spec, const char **affinity_spec,
3736 					  u32 nr_spec)
3737 {
3738 	u32 s;
3739 	int ret = 0, t = 0;
3740 	struct mmap_cpu_mask cpus_mask;
3741 	struct thread_mask thread_mask, full_mask, *thread_masks;
3742 
3743 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3744 	if (ret) {
3745 		pr_err("Failed to allocate CPUs mask\n");
3746 		return ret;
3747 	}
3748 
3749 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3750 	if (ret) {
3751 		pr_err("Failed to init cpu mask\n");
3752 		goto out_free_cpu_mask;
3753 	}
3754 
3755 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3756 	if (ret) {
3757 		pr_err("Failed to allocate full mask\n");
3758 		goto out_free_cpu_mask;
3759 	}
3760 
3761 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3762 	if (ret) {
3763 		pr_err("Failed to allocate thread mask\n");
3764 		goto out_free_full_and_cpu_masks;
3765 	}
3766 
3767 	for (s = 0; s < nr_spec; s++) {
3768 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3769 		if (ret) {
3770 			pr_err("Failed to initialize maps thread mask\n");
3771 			goto out_free;
3772 		}
3773 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3774 		if (ret) {
3775 			pr_err("Failed to initialize affinity thread mask\n");
3776 			goto out_free;
3777 		}
3778 
3779 		/* ignore invalid CPUs but do not allow empty masks */
3780 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3781 				cpus_mask.bits, thread_mask.maps.nbits)) {
3782 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3783 			ret = -EINVAL;
3784 			goto out_free;
3785 		}
3786 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3787 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3788 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3789 			ret = -EINVAL;
3790 			goto out_free;
3791 		}
3792 
3793 		/* do not allow intersection with other masks (full_mask) */
3794 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3795 				      thread_mask.maps.nbits)) {
3796 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3797 			ret = -EINVAL;
3798 			goto out_free;
3799 		}
3800 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3801 				      thread_mask.affinity.nbits)) {
3802 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3803 			ret = -EINVAL;
3804 			goto out_free;
3805 		}
3806 
3807 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3808 			  thread_mask.maps.bits, full_mask.maps.nbits);
3809 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3810 			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3811 
3812 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3813 		if (!thread_masks) {
3814 			pr_err("Failed to reallocate thread masks\n");
3815 			ret = -ENOMEM;
3816 			goto out_free;
3817 		}
3818 		rec->thread_masks = thread_masks;
3819 		rec->thread_masks[t] = thread_mask;
3820 		if (verbose > 0) {
3821 			pr_debug("thread_masks[%d]: ", t);
3822 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3823 			pr_debug("thread_masks[%d]: ", t);
3824 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3825 		}
3826 		t++;
3827 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3828 		if (ret) {
3829 			pr_err("Failed to allocate thread mask\n");
3830 			goto out_free_full_and_cpu_masks;
3831 		}
3832 	}
3833 	rec->nr_threads = t;
3834 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3835 	if (!rec->nr_threads)
3836 		ret = -EINVAL;
3837 
3838 out_free:
3839 	record__thread_mask_free(&thread_mask);
3840 out_free_full_and_cpu_masks:
3841 	record__thread_mask_free(&full_mask);
3842 out_free_cpu_mask:
3843 	record__mmap_cpu_mask_free(&cpus_mask);
3844 
3845 	return ret;
3846 }
3847 
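/*
 * --threads=core: one data streaming thread per core, reusing the core
 * CPU lists from the CPU topology for both the maps and the affinity
 * masks. record__init_thread_package_masks() below does the same per
 * package.
 */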
3848 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3849 {
3850 	int ret;
3851 	struct cpu_topology *topo;
3852 
3853 	topo = cpu_topology__new();
3854 	if (!topo) {
3855 		pr_err("Failed to allocate CPU topology\n");
3856 		return -ENOMEM;
3857 	}
3858 
3859 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3860 					     topo->core_cpus_list, topo->core_cpus_lists);
3861 	cpu_topology__delete(topo);
3862 
3863 	return ret;
3864 }
3865 
3866 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3867 {
3868 	int ret;
3869 	struct cpu_topology *topo;
3870 
3871 	topo = cpu_topology__new();
3872 	if (!topo) {
3873 		pr_err("Failed to allocate CPU topology\n");
3874 		return -ENOMEM;
3875 	}
3876 
3877 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3878 					     topo->package_cpus_list, topo->package_cpus_lists);
3879 	cpu_topology__delete(topo);
3880 
3881 	return ret;
3882 }
3883 
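/*
 * --threads=numa: one data streaming thread per NUMA node, using each
 * node's CPU list for both the maps and the affinity masks.
 */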
3884 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3885 {
3886 	u32 s;
3887 	int ret;
3888 	const char **spec;
3889 	struct numa_topology *topo;
3890 
3891 	topo = numa_topology__new();
3892 	if (!topo) {
3893 		pr_err("Failed to allocate NUMA topology\n");
3894 		return -ENOMEM;
3895 	}
3896 
3897 	spec = zalloc(topo->nr * sizeof(char *));
3898 	if (!spec) {
3899 		pr_err("Failed to allocate NUMA spec\n");
3900 		ret = -ENOMEM;
3901 		goto out_delete_topo;
3902 	}
3903 	for (s = 0; s < topo->nr; s++)
3904 		spec[s] = topo->nodes[s].cpus;
3905 
3906 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3907 
3908 	zfree(&spec);
3909 
3910 out_delete_topo:
3911 	numa_topology__delete(topo);
3912 
3913 	return ret;
3914 }
3915 
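/*
 * User defined spec: a ':' separated list of <maps cpus>/<affinity cpus>
 * entries, e.g.:
 *
 *   perf record --threads=0-3/0-3:4-7/4-7 ...
 *
 * creates two threads, the list before '/' selecting the mmaps a thread
 * reads and the list after it the CPU affinity of that thread.
 */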
3916 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3917 {
3918 	int t, ret;
3919 	u32 s, nr_spec = 0;
3920 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3921 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3922 
3923 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3924 		spec = strtok_r(user_spec, ":", &spec_ptr);
3925 		if (spec == NULL)
3926 			break;
3927 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3928 		mask = strtok_r(spec, "/", &mask_ptr);
3929 		if (mask == NULL)
3930 			break;
3931 		pr_debug2("  maps mask: %s\n", mask);
3932 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3933 		if (!tmp_spec) {
3934 			pr_err("Failed to reallocate maps spec\n");
3935 			ret = -ENOMEM;
3936 			goto out_free;
3937 		}
3938 		maps_spec = tmp_spec;
3939 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3940 		if (!maps_spec[nr_spec]) {
3941 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3942 			ret = -ENOMEM;
3943 			goto out_free;
3944 		}
3945 		mask = strtok_r(NULL, "/", &mask_ptr);
3946 		if (mask == NULL) {
3947 			pr_err("Invalid thread maps or affinity specs\n");
3948 			ret = -EINVAL;
3949 			goto out_free;
3950 		}
3951 		pr_debug2("  affinity mask: %s\n", mask);
3952 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3953 		if (!tmp_spec) {
3954 			pr_err("Failed to reallocate affinity spec\n");
3955 			ret = -ENOMEM;
3956 			goto out_free;
3957 		}
3958 		affinity_spec = tmp_spec;
3959 		affinity_spec[nr_spec] = strdup(mask);
3960 		if (!affinity_spec[nr_spec]) {
3961 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3962 			ret = -ENOMEM;
3963 			goto out_free;
3964 		}
3965 		dup_mask = NULL;
3966 		nr_spec++;
3967 	}
3968 
3969 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3970 					     (const char **)affinity_spec, nr_spec);
3971 
3972 out_free:
3973 	free(dup_mask);
3974 	for (s = 0; s < nr_spec; s++) {
3975 		if (maps_spec)
3976 			free(maps_spec[s]);
3977 		if (affinity_spec)
3978 			free(affinity_spec[s]);
3979 	}
3980 	free(affinity_spec);
3981 	free(maps_spec);
3982 
3983 	return ret;
3984 }
3985 
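/*
 * Default (no --threads): a single thread mask covering all evlist
 * CPUs; only the maps mask is populated, the affinity mask is left
 * untouched.
 */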
3986 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3987 {
3988 	int ret;
3989 
3990 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3991 	if (ret)
3992 		return ret;
3993 
3994 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3995 		return -ENODEV;
3996 
3997 	rec->nr_threads = 1;
3998 
3999 	return 0;
4000 }
4001 
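/*
 * Pick the thread mask layout according to the --threads spec. Parallel
 * data streaming is not compatible with --per-thread mmaps, so that
 * combination is rejected here.
 */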
4002 static int record__init_thread_masks(struct record *rec)
4003 {
4004 	int ret = 0;
4005 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
4006 
4007 	if (!record__threads_enabled(rec))
4008 		return record__init_thread_default_masks(rec, cpus);
4009 
4010 	if (evlist__per_thread(rec->evlist)) {
4011 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
4012 		return -EINVAL;
4013 	}
4014 
4015 	switch (rec->opts.threads_spec) {
4016 	case THREAD_SPEC__CPU:
4017 		ret = record__init_thread_cpu_masks(rec, cpus);
4018 		break;
4019 	case THREAD_SPEC__CORE:
4020 		ret = record__init_thread_core_masks(rec, cpus);
4021 		break;
4022 	case THREAD_SPEC__PACKAGE:
4023 		ret = record__init_thread_package_masks(rec, cpus);
4024 		break;
4025 	case THREAD_SPEC__NUMA:
4026 		ret = record__init_thread_numa_masks(rec, cpus);
4027 		break;
4028 	case THREAD_SPEC__USER:
4029 		ret = record__init_thread_user_masks(rec, cpus);
4030 		break;
4031 	default:
4032 		break;
4033 	}
4034 
4035 	return ret;
4036 }
4037 
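/*
 * Entry point of 'perf record': parse and validate the options, set up
 * the evlist, auxtrace, build-id and thread mask state, then hand off
 * to __cmd_record(), e.g. for:
 *
 *   perf record -e cycles -g -- ./workload
 */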
4038 int cmd_record(int argc, const char **argv)
4039 {
4040 	int err;
4041 	struct record *rec = &record;
4042 	char errbuf[BUFSIZ];
4043 
4044 	setlocale(LC_ALL, "");
4045 
4046 #ifndef HAVE_BPF_SKEL
4047 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
4048 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
4049 # undef set_nobuild
4050 #endif
4051 
4052 	/* Disable eager loading of kernel symbols, which adds overhead to perf record. */
4053 	symbol_conf.lazy_load_kernel_maps = true;
4054 	rec->opts.affinity = PERF_AFFINITY_SYS;
4055 
4056 	rec->evlist = evlist__new();
4057 	if (rec->evlist == NULL)
4058 		return -ENOMEM;
4059 
4060 	err = perf_config(perf_record_config, rec);
4061 	if (err)
4062 		return err;
4063 
4064 	argc = parse_options(argc, argv, record_options, record_usage,
4065 			    PARSE_OPT_STOP_AT_NON_OPTION);
4066 	if (quiet)
4067 		perf_quiet_option();
4068 
4069 	err = symbol__validate_sym_arguments();
4070 	if (err)
4071 		return err;
4072 
4073 	perf_debuginfod_setup(&record.debuginfod);
4074 
4075 	/* Make system wide (-a) the default target. */
4076 	if (!argc && target__none(&rec->opts.target))
4077 		rec->opts.target.system_wide = true;
4078 
4079 	if (nr_cgroups && !rec->opts.target.system_wide) {
4080 		usage_with_options_msg(record_usage, record_options,
4081 			"cgroup monitoring only available in system-wide mode");
4083 	}
4084 
4085 	if (record.latency) {
4086 		/*
4087 		 * There is no fundamental reason why latency profiling
4088 		 * can't work for system-wide mode, but exact semantics
4089 		 * and details are to be defined.
4090 		 * See the following thread for details:
4091 		 * https://lore.kernel.org/all/Z4XDJyvjiie3howF@google.com/
4092 		 */
4093 		if (record.opts.target.system_wide) {
4094 			pr_err("Failed: latency profiling is not supported with system-wide collection.\n");
4095 			err = -EINVAL;
4096 			goto out_opts;
4097 		}
4098 		record.opts.record_switch_events = true;
4099 	}
4100 
4101 	if (rec->buildid_mmap) {
4102 		if (!perf_can_record_build_id()) {
4103 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4104 			err = -EINVAL;
4105 			goto out_opts;
4106 		}
4107 		pr_debug("Enabling build id in mmap2 events.\n");
4108 		/* Enable mmap build id synthesizing. */
4109 		symbol_conf.buildid_mmap2 = true;
4110 		/* Enable perf_event_attr::build_id bit. */
4111 		rec->opts.build_id = true;
4112 		/* Disable build id cache. */
4113 		rec->no_buildid = true;
4114 	}
4115 
4116 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4117 		pr_err("Kernel has no cgroup sampling support.\n");
4118 		err = -EINVAL;
4119 		goto out_opts;
4120 	}
4121 
4122 	if (rec->opts.kcore)
4123 		rec->opts.text_poke = true;
4124 
4125 	if (rec->opts.kcore || record__threads_enabled(rec))
4126 		rec->data.is_dir = true;
4127 
4128 	if (record__threads_enabled(rec)) {
4129 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4130 			pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
4131 			goto out_opts;
4132 		}
4133 		if (record__aio_enabled(rec)) {
4134 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
4135 			goto out_opts;
4136 		}
4137 	}
4138 
4139 	if (rec->opts.comp_level != 0) {
4140 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4141 		rec->no_buildid = true;
4142 	}
4143 
4144 	if (rec->opts.record_switch_events &&
4145 	    !perf_can_record_switch_events()) {
4146 		ui__error("kernel does not support recording context switch events\n");
4147 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4148 		err = -EINVAL;
4149 		goto out_opts;
4150 	}
4151 
4152 	if (switch_output_setup(rec)) {
4153 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4154 		err = -EINVAL;
4155 		goto out_opts;
4156 	}
4157 
4158 	if (rec->switch_output.time) {
4159 		signal(SIGALRM, alarm_sig_handler);
4160 		alarm(rec->switch_output.time);
4161 	}
4162 
4163 	if (rec->switch_output.num_files) {
4164 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4165 						      sizeof(char *));
4166 		if (!rec->switch_output.filenames) {
4167 			err = -EINVAL;
4168 			goto out_opts;
4169 		}
4170 	}
4171 
4172 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4173 		rec->timestamp_filename = false;
4174 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4175 	}
4176 
4177 	if (rec->filter_action) {
4178 		if (!strcmp(rec->filter_action, "pin"))
4179 			err = perf_bpf_filter__pin();
4180 		else if (!strcmp(rec->filter_action, "unpin"))
4181 			err = perf_bpf_filter__unpin();
4182 		else {
4183 			pr_warning("Unknown BPF filter action: %s\n", rec->filter_action);
4184 			err = -EINVAL;
4185 		}
4186 		goto out_opts;
4187 	}
4188 
4189 	/* For backward compatibility, -d implies --mem-info */
4190 	if (rec->opts.sample_address)
4191 		rec->opts.sample_data_src = true;
4192 
4193 	/*
4194 	 * Allow aliases to facilitate the lookup of symbols for address
4195 	 * filters. Refer to auxtrace_parse_filters().
4196 	 */
4197 	symbol_conf.allow_aliases = true;
4198 
4199 	symbol__init(NULL);
4200 
4201 	err = record__auxtrace_init(rec);
4202 	if (err)
4203 		goto out;
4204 
4205 	if (dry_run)
4206 		goto out;
4207 
4208 	err = -ENOMEM;
4209 
4210 	if (rec->no_buildid_cache || rec->no_buildid) {
4211 		disable_buildid_cache();
4212 	} else if (rec->switch_output.enabled) {
4213 		/*
4214 		 * In 'perf record --switch-output', disable buildid
4215 		 * generation by default to reduce data file switching
4216 		 * overhead. Still generate build-ids if they are explicitly
4217 		 * required using
4218 		 *
4219 		 *  perf record --switch-output --no-no-buildid \
4220 		 *              --no-no-buildid-cache
4221 		 *
4222 		 * Following code equals to:
4223 		 *
4224 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4225 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4226 		 *         disable_buildid_cache();
4227 		 */
4228 		bool disable = true;
4229 
4230 		if (rec->no_buildid_set && !rec->no_buildid)
4231 			disable = false;
4232 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4233 			disable = false;
4234 		if (disable) {
4235 			rec->no_buildid = true;
4236 			rec->no_buildid_cache = true;
4237 			disable_buildid_cache();
4238 		}
4239 	}
4240 
4241 	if (record.opts.overwrite)
4242 		record.opts.tail_synthesize = true;
4243 
4244 	if (rec->evlist->core.nr_entries == 0) {
4245 		err = parse_event(rec->evlist, "cycles:P");
4246 		if (err)
4247 			goto out;
4248 	}
4249 
4250 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4251 		rec->opts.no_inherit = true;
4252 
4253 	err = target__validate(&rec->opts.target);
4254 	if (err) {
4255 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4256 		ui__warning("%s\n", errbuf);
4257 	}
4258 
4259 	if (rec->uid_str) {
4260 		uid_t uid = parse_uid(rec->uid_str);
4261 
4262 		if (uid == UINT_MAX) {
4263 			ui__error("Invalid User: %s", rec->uid_str);
4264 			err = -EINVAL;
4265 			goto out;
4266 		}
4267 		err = parse_uid_filter(rec->evlist, uid);
4268 		if (err)
4269 			goto out;
4270 
4271 		/* User ID filtering implies system wide. */
4272 		rec->opts.target.system_wide = true;
4273 	}
4274 
4275 	/* Enable ignoring missing threads when -p option is defined. */
4276 	rec->opts.ignore_missing_thread = rec->opts.target.pid;
4277 
4278 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4279 
4280 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4281 		arch__add_leaf_frame_record_opts(&rec->opts);
4282 
4283 	err = -ENOMEM;
4284 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4285 		if (rec->opts.target.pid != NULL) {
4286 			pr_err("Couldn't create thread/CPU maps: %s\n",
4287 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4288 			goto out;
4289 		}
4290 		} else
4292 	}
4293 
4294 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4295 	if (err)
4296 		goto out;
4297 
4298 	/*
4299 	 * Take all build-ids when the file contains AUX area tracing
4300 	 * data: the trace is not decoded here (it would take too long),
4301 	 * so we cannot tell which DSOs were actually hit.
4302 	 */
4303 	if (rec->opts.full_auxtrace)
4304 		rec->buildid_all = true;
4305 
4306 	if (rec->opts.text_poke) {
4307 		err = record__config_text_poke(rec->evlist);
4308 		if (err) {
4309 			pr_err("record__config_text_poke failed, error %d\n", err);
4310 			goto out;
4311 		}
4312 	}
4313 
4314 	if (rec->off_cpu) {
4315 		err = record__config_off_cpu(rec);
4316 		if (err) {
4317 			pr_err("record__config_off_cpu failed, error %d\n", err);
4318 			goto out;
4319 		}
4320 	}
4321 
4322 	if (record_opts__config(&rec->opts)) {
4323 		err = -EINVAL;
4324 		goto out;
4325 	}
4326 
4327 	err = record__config_tracking_events(rec);
4328 	if (err) {
4329 		pr_err("record__config_tracking_events failed, error %d\n", err);
4330 		goto out;
4331 	}
4332 
4333 	err = record__init_thread_masks(rec);
4334 	if (err) {
4335 		pr_err("Failed to initialize parallel data streaming masks\n");
4336 		goto out;
4337 	}
4338 
4339 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4340 		rec->opts.nr_cblocks = nr_cblocks_max;
4341 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4342 
4343 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4344 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4345 
4346 	if (rec->opts.comp_level > comp_level_max)
4347 		rec->opts.comp_level = comp_level_max;
4348 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4349 
4350 	err = __cmd_record(&record, argc, argv);
4351 out:
4352 	record__free_thread_masks(rec, rec->nr_threads);
4353 	rec->nr_threads = 0;
4354 	symbol__exit();
4355 	auxtrace_record__free(rec->itr);
4356 out_opts:
4357 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4358 	evlist__delete(rec->evlist);
4359 	return err;
4360 }
4361 
4362 static void snapshot_sig_handler(int sig __maybe_unused)
4363 {
4364 	struct record *rec = &record;
4365 
4366 	hit_auxtrace_snapshot_trigger(rec);
4367 
4368 	if (switch_output_signal(rec))
4369 		trigger_hit(&switch_output_trigger);
4370 }
4371 
4372 static void alarm_sig_handler(int sig __maybe_unused)
4373 {
4374 	struct record *rec = &record;
4375 
4376 	if (switch_output_time(rec))
4377 		trigger_hit(&switch_output_trigger);
4378 }
4379