xref: /linux/tools/perf/builtin-record.c (revision 722ecdbce68a87de2d9296f91308f44ea900a039)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "util/pmu-hybrid.h"
51 #include "util/evlist-hybrid.h"
52 #include "util/off_cpu.h"
53 #include "asm/bug.h"
54 #include "perf.h"
55 #include "cputopo.h"
56 
57 #include <errno.h>
58 #include <inttypes.h>
59 #include <locale.h>
60 #include <poll.h>
61 #include <pthread.h>
62 #include <unistd.h>
63 #ifndef HAVE_GETTID
64 #include <syscall.h>
65 #endif
66 #include <sched.h>
67 #include <signal.h>
68 #ifdef HAVE_EVENTFD_SUPPORT
69 #include <sys/eventfd.h>
70 #endif
71 #include <sys/mman.h>
72 #include <sys/wait.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <fcntl.h>
76 #include <linux/err.h>
77 #include <linux/string.h>
78 #include <linux/time64.h>
79 #include <linux/zalloc.h>
80 #include <linux/bitmap.h>
81 #include <sys/time.h>
82 
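/*
 * State for the --switch-output option: the current output file can be
 * rotated on SIGUSR2 ('signal'), after a given amount of written data
 * ('size') or periodically ('time'). When a file count is configured,
 * the generated file names are kept in a small ring and older dumps are
 * removed as new ones are produced.
 */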
83 struct switch_output {
84 	bool		 enabled;
85 	bool		 signal;
86 	unsigned long	 size;
87 	unsigned long	 time;
88 	const char	*str;
89 	bool		 set;
90 	char		 **filenames;
91 	int		 num_files;
92 	int		 cur_file;
93 };
94 
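/*
 * Per-thread CPU masks: 'maps' selects which mmaps a recording thread
 * reads, 'affinity' the CPUs the thread itself may be scheduled on.
 */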
95 struct thread_mask {
96 	struct mmap_cpu_mask	maps;
97 	struct mmap_cpu_mask	affinity;
98 };
99 
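/*
 * State of one recording (data streaming) thread used with --threads:
 * the msg/ack pipes synchronize startup and termination with the main
 * thread, pollfd duplicates the poll entries for the mmaps this thread
 * serves, and the byte counters feed the final statistics.
 */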
100 struct record_thread {
101 	pid_t			tid;
102 	struct thread_mask	*mask;
103 	struct {
104 		int		msg[2];
105 		int		ack[2];
106 	} pipes;
107 	struct fdarray		pollfd;
108 	int			ctlfd_pos;
109 	int			nr_mmaps;
110 	struct mmap		**maps;
111 	struct mmap		**overwrite_maps;
112 	struct record		*rec;
113 	unsigned long long	samples;
114 	unsigned long		waking;
115 	u64			bytes_written;
116 	u64			bytes_transferred;
117 	u64			bytes_compressed;
118 };
119 
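/* Thread-local pointer of each recording thread to its own record_thread state. */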
120 static __thread struct record_thread *thread;
121 
122 enum thread_msg {
123 	THREAD_MSG__UNDEFINED = 0,
124 	THREAD_MSG__READY,
125 	THREAD_MSG__MAX,
126 };
127 
128 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
129 	"UNDEFINED", "READY"
130 };
131 
132 enum thread_spec {
133 	THREAD_SPEC__UNDEFINED = 0,
134 	THREAD_SPEC__CPU,
135 	THREAD_SPEC__CORE,
136 	THREAD_SPEC__PACKAGE,
137 	THREAD_SPEC__NUMA,
138 	THREAD_SPEC__USER,
139 	THREAD_SPEC__MAX,
140 };
141 
142 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
143 	"undefined", "cpu", "core", "package", "numa", "user"
144 };
145 
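/* Global state of a 'perf record' session. */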
146 struct record {
147 	struct perf_tool	tool;
148 	struct record_opts	opts;
149 	u64			bytes_written;
150 	struct perf_data	data;
151 	struct auxtrace_record	*itr;
152 	struct evlist	*evlist;
153 	struct perf_session	*session;
154 	struct evlist		*sb_evlist;
155 	pthread_t		thread_id;
156 	int			realtime_prio;
157 	bool			switch_output_event_set;
158 	bool			no_buildid;
159 	bool			no_buildid_set;
160 	bool			no_buildid_cache;
161 	bool			no_buildid_cache_set;
162 	bool			buildid_all;
163 	bool			buildid_mmap;
164 	bool			timestamp_filename;
165 	bool			timestamp_boundary;
166 	bool			off_cpu;
167 	struct switch_output	switch_output;
168 	unsigned long long	samples;
169 	unsigned long		output_max_size;	/* = 0: unlimited */
170 	struct perf_debuginfod	debuginfod;
171 	int			nr_threads;
172 	struct thread_mask	*thread_masks;
173 	struct record_thread	*thread_data;
174 };
175 
176 static volatile int done;
177 
178 static volatile int auxtrace_record__snapshot_started;
179 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
180 static DEFINE_TRIGGER(switch_output_trigger);
181 
182 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
183 	"SYS", "NODE", "CPU"
184 };
185 
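/* Fallback for C libraries that do not provide a gettid() wrapper. */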
186 #ifndef HAVE_GETTID
187 static inline pid_t gettid(void)
188 {
189 	return (pid_t)syscall(__NR_gettid);
190 }
191 #endif
192 
193 static int record__threads_enabled(struct record *rec)
194 {
195 	return rec->opts.threads_spec;
196 }
197 
198 static bool switch_output_signal(struct record *rec)
199 {
200 	return rec->switch_output.signal &&
201 	       trigger_is_ready(&switch_output_trigger);
202 }
203 
204 static bool switch_output_size(struct record *rec)
205 {
206 	return rec->switch_output.size &&
207 	       trigger_is_ready(&switch_output_trigger) &&
208 	       (rec->bytes_written >= rec->switch_output.size);
209 }
210 
211 static bool switch_output_time(struct record *rec)
212 {
213 	return rec->switch_output.time &&
214 	       trigger_is_ready(&switch_output_trigger);
215 }
216 
217 static u64 record__bytes_written(struct record *rec)
218 {
219 	int t;
220 	u64 bytes_written = rec->bytes_written;
221 	struct record_thread *thread_data = rec->thread_data;
222 
223 	for (t = 0; t < rec->nr_threads; t++)
224 		bytes_written += thread_data[t].bytes_written;
225 
226 	return bytes_written;
227 }
228 
229 static bool record__output_max_size_exceeded(struct record *rec)
230 {
231 	return rec->output_max_size &&
232 	       (record__bytes_written(rec) >= rec->output_max_size);
233 }
234 
235 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
236 			 void *bf, size_t size)
237 {
238 	struct perf_data_file *file = &rec->session->data->file;
239 
240 	if (map && map->file)
241 		file = map->file;
242 
243 	if (perf_data_file__write(file, bf, size) < 0) {
244 		pr_err("failed to write perf data, error: %m\n");
245 		return -1;
246 	}
247 
248 	if (map && map->file)
249 		thread->bytes_written += size;
250 	else
251 		rec->bytes_written += size;
252 
253 	if (record__output_max_size_exceeded(rec) && !done) {
254 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
255 				" stopping session ]\n",
256 				record__bytes_written(rec) >> 10);
257 		done = 1;
258 	}
259 
260 	if (switch_output_size(rec))
261 		trigger_hit(&switch_output_trigger);
262 
263 	return 0;
264 }
265 
266 static int record__aio_enabled(struct record *rec);
267 static int record__comp_enabled(struct record *rec);
268 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
269 			    void *dst, size_t dst_size, void *src, size_t src_size);
270 
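/*
 * Asynchronous trace writing (--aio): data is copied out of the kernel
 * ring buffer into one of the per-mmap aio.data[] buffers and queued to
 * the output file with aio_write(), so the ring can be consumed and
 * handed back to the kernel without waiting for the disk write.
 */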
271 #ifdef HAVE_AIO_SUPPORT
272 static int record__aio_write(struct aiocb *cblock, int trace_fd,
273 		void *buf, size_t size, off_t off)
274 {
275 	int rc;
276 
277 	cblock->aio_fildes = trace_fd;
278 	cblock->aio_buf    = buf;
279 	cblock->aio_nbytes = size;
280 	cblock->aio_offset = off;
281 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
282 
283 	do {
284 		rc = aio_write(cblock);
285 		if (rc == 0) {
286 			break;
287 		} else if (errno != EAGAIN) {
288 			cblock->aio_fildes = -1;
289 			pr_err("failed to queue perf data, error: %m\n");
290 			break;
291 		}
292 	} while (1);
293 
294 	return rc;
295 }
296 
297 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
298 {
299 	void *rem_buf;
300 	off_t rem_off;
301 	size_t rem_size;
302 	int rc, aio_errno;
303 	ssize_t aio_ret, written;
304 
305 	aio_errno = aio_error(cblock);
306 	if (aio_errno == EINPROGRESS)
307 		return 0;
308 
309 	written = aio_ret = aio_return(cblock);
310 	if (aio_ret < 0) {
311 		if (aio_errno != EINTR)
312 			pr_err("failed to write perf data, error: %m\n");
313 		written = 0;
314 	}
315 
316 	rem_size = cblock->aio_nbytes - written;
317 
318 	if (rem_size == 0) {
319 		cblock->aio_fildes = -1;
320 		/*
321 		 * md->refcount is incremented in record__aio_pushfn() for
322 		 * every aio write request started in record__aio_push() so
323 		 * decrement it because the request is now complete.
324 		 */
325 		perf_mmap__put(&md->core);
326 		rc = 1;
327 	} else {
328 		/*
329 		 * The aio write request may require a restart with the
330 		 * remainder if the kernel didn't write the whole
331 		 * chunk at once.
332 		 */
333 		rem_off = cblock->aio_offset + written;
334 		rem_buf = (void *)(cblock->aio_buf + written);
335 		record__aio_write(cblock, cblock->aio_fildes,
336 				rem_buf, rem_size, rem_off);
337 		rc = 0;
338 	}
339 
340 	return rc;
341 }
342 
343 static int record__aio_sync(struct mmap *md, bool sync_all)
344 {
345 	struct aiocb **aiocb = md->aio.aiocb;
346 	struct aiocb *cblocks = md->aio.cblocks;
347 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
348 	int i, do_suspend;
349 
350 	do {
351 		do_suspend = 0;
352 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
353 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
354 				if (sync_all)
355 					aiocb[i] = NULL;
356 				else
357 					return i;
358 			} else {
359 				/*
360 				 * The started aio write is not complete yet,
361 				 * so it has to be waited for before the
362 				 * next allocation.
363 				 */
364 				aiocb[i] = &cblocks[i];
365 				do_suspend = 1;
366 			}
367 		}
368 		if (!do_suspend)
369 			return -1;
370 
371 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
372 			if (!(errno == EAGAIN || errno == EINTR))
373 				pr_err("failed to sync perf data, error: %m\n");
374 		}
375 	} while (1);
376 }
377 
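/* Context passed to record__aio_pushfn(): destination aio buffer and its fill level. */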
378 struct record_aio {
379 	struct record	*rec;
380 	void		*data;
381 	size_t		size;
382 };
383 
384 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
385 {
386 	struct record_aio *aio = to;
387 
388 	/*
389 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
390 	 * buffer to release space in the kernel buffer as fast as possible, by calling
391 	 * perf_mmap__consume() from the perf_mmap__push() function.
392 	 *
393 	 * That lets the kernel proceed with storing more profiling data into
394 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
395 	 *
396 	 * Copying can be done in two steps in case the chunk of profiling data
397 	 * crosses the upper bound of the kernel buffer. In this case we first move
398 	 * the part of the data from map->start to the upper bound and then the remainder
399 	 * from the beginning of the kernel buffer to the end of the data chunk.
400 	 */
401 
402 	if (record__comp_enabled(aio->rec)) {
403 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
404 				     mmap__mmap_len(map) - aio->size,
405 				     buf, size);
406 	} else {
407 		memcpy(aio->data + aio->size, buf, size);
408 	}
409 
410 	if (!aio->size) {
411 		/*
412 		 * Increment map->refcount to guard the map->aio.data[] buffer
413 		 * from premature deallocation, because the map object can be
414 		 * released before the aio write request started on the
415 		 * map->aio.data[] buffer completes.
416 		 *
417 		 * perf_mmap__put() is done at record__aio_complete()
418 		 * after the started aio request completes, or at record__aio_push()
419 		 * if the request failed to start.
420 		 */
421 		perf_mmap__get(&map->core);
422 	}
423 
424 	aio->size += size;
425 
426 	return size;
427 }
428 
429 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
430 {
431 	int ret, idx;
432 	int trace_fd = rec->session->data->file.fd;
433 	struct record_aio aio = { .rec = rec, .size = 0 };
434 
435 	/*
436 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
437 	 * becomes available after the previous aio write operation completes.
438 	 */
439 
440 	idx = record__aio_sync(map, false);
441 	aio.data = map->aio.data[idx];
442 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
443 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
444 		return ret;
445 
446 	rec->samples++;
447 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
448 	if (!ret) {
449 		*off += aio.size;
450 		rec->bytes_written += aio.size;
451 		if (switch_output_size(rec))
452 			trigger_hit(&switch_output_trigger);
453 	} else {
454 		/*
455 		 * Decrement map->refcount incremented in record__aio_pushfn()
456 		 * back if record__aio_write() operation failed to start, otherwise
457 		 * map->refcount is decremented in record__aio_complete() after
458 		 * aio write operation finishes successfully.
459 		 */
460 		perf_mmap__put(&map->core);
461 	}
462 
463 	return ret;
464 }
465 
466 static off_t record__aio_get_pos(int trace_fd)
467 {
468 	return lseek(trace_fd, 0, SEEK_CUR);
469 }
470 
471 static void record__aio_set_pos(int trace_fd, off_t pos)
472 {
473 	lseek(trace_fd, pos, SEEK_SET);
474 }
475 
476 static void record__aio_mmap_read_sync(struct record *rec)
477 {
478 	int i;
479 	struct evlist *evlist = rec->evlist;
480 	struct mmap *maps = evlist->mmap;
481 
482 	if (!record__aio_enabled(rec))
483 		return;
484 
485 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
486 		struct mmap *map = &maps[i];
487 
488 		if (map->core.base)
489 			record__aio_sync(map, true);
490 	}
491 }
492 
493 static int nr_cblocks_default = 1;
494 static int nr_cblocks_max = 4;
495 
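/*
 * Parse --aio[=n]: the number of AIO control blocks per mmap, defaulting
 * to nr_cblocks_default; see nr_cblocks_max for the upper bound.
 */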
496 static int record__aio_parse(const struct option *opt,
497 			     const char *str,
498 			     int unset)
499 {
500 	struct record_opts *opts = (struct record_opts *)opt->value;
501 
502 	if (unset) {
503 		opts->nr_cblocks = 0;
504 	} else {
505 		if (str)
506 			opts->nr_cblocks = strtol(str, NULL, 0);
507 		if (!opts->nr_cblocks)
508 			opts->nr_cblocks = nr_cblocks_default;
509 	}
510 
511 	return 0;
512 }
513 #else /* HAVE_AIO_SUPPORT */
514 static int nr_cblocks_max = 0;
515 
516 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
517 			    off_t *off __maybe_unused)
518 {
519 	return -1;
520 }
521 
522 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
523 {
524 	return -1;
525 }
526 
527 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
528 {
529 }
530 
531 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
532 {
533 }
534 #endif
535 
536 static int record__aio_enabled(struct record *rec)
537 {
538 	return rec->opts.nr_cblocks > 0;
539 }
540 
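/*
 * Parse --mmap-flush: the minimal number of bytes accumulated in an mmap
 * before it is flushed, given either as a plain number or with a
 * B/K/M/G suffix, and capped at a quarter of the mmap buffer size.
 */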
541 #define MMAP_FLUSH_DEFAULT 1
542 static int record__mmap_flush_parse(const struct option *opt,
543 				    const char *str,
544 				    int unset)
545 {
546 	int flush_max;
547 	struct record_opts *opts = (struct record_opts *)opt->value;
548 	static struct parse_tag tags[] = {
549 			{ .tag  = 'B', .mult = 1       },
550 			{ .tag  = 'K', .mult = 1 << 10 },
551 			{ .tag  = 'M', .mult = 1 << 20 },
552 			{ .tag  = 'G', .mult = 1 << 30 },
553 			{ .tag  = 0 },
554 	};
555 
556 	if (unset)
557 		return 0;
558 
559 	if (str) {
560 		opts->mmap_flush = parse_tag_value(str, tags);
561 		if (opts->mmap_flush == (int)-1)
562 			opts->mmap_flush = strtol(str, NULL, 0);
563 	}
564 
565 	if (!opts->mmap_flush)
566 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
567 
568 	flush_max = evlist__mmap_size(opts->mmap_pages);
569 	flush_max /= 4;
570 	if (opts->mmap_flush > flush_max)
571 		opts->mmap_flush = flush_max;
572 
573 	return 0;
574 }
575 
576 #ifdef HAVE_ZSTD_SUPPORT
577 static unsigned int comp_level_default = 1;
578 
579 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
580 {
581 	struct record_opts *opts = opt->value;
582 
583 	if (unset) {
584 		opts->comp_level = 0;
585 	} else {
586 		if (str)
587 			opts->comp_level = strtol(str, NULL, 0);
588 		if (!opts->comp_level)
589 			opts->comp_level = comp_level_default;
590 	}
591 
592 	return 0;
593 }
594 #endif
595 static unsigned int comp_level_max = 22;
596 
597 static int record__comp_enabled(struct record *rec)
598 {
599 	return rec->opts.comp_level > 0;
600 }
601 
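/*
 * Write a synthesized (non-hardware) event straight into the output
 * file. The _locked variant below serializes writers when event
 * synthesis runs multithreaded (nr_threads_synthesize > 1).
 */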
602 static int process_synthesized_event(struct perf_tool *tool,
603 				     union perf_event *event,
604 				     struct perf_sample *sample __maybe_unused,
605 				     struct machine *machine __maybe_unused)
606 {
607 	struct record *rec = container_of(tool, struct record, tool);
608 	return record__write(rec, NULL, event, event->header.size);
609 }
610 
611 static int process_locked_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
617 	int ret;
618 
619 	pthread_mutex_lock(&synth_lock);
620 	ret = process_synthesized_event(tool, event, sample, machine);
621 	pthread_mutex_unlock(&synth_lock);
622 	return ret;
623 }
624 
625 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
626 {
627 	struct record *rec = to;
628 
629 	if (record__comp_enabled(rec)) {
630 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
631 		bf   = map->data;
632 	}
633 
634 	thread->samples++;
635 	return record__write(rec, map, bf, size);
636 }
637 
638 static volatile int signr = -1;
639 static volatile int child_finished;
640 #ifdef HAVE_EVENTFD_SUPPORT
641 static int done_fd = -1;
642 #endif
643 
644 static void sig_handler(int sig)
645 {
646 	if (sig == SIGCHLD)
647 		child_finished = 1;
648 	else
649 		signr = sig;
650 
651 	done = 1;
652 #ifdef HAVE_EVENTFD_SUPPORT
653 {
654 	u64 tmp = 1;
655 	/*
656 	 * It is possible for this signal handler to run after done is checked
657 	 * in the main loop, but before the perf counter fds are polled. If this
658 	 * happens, the poll() will continue to wait even though done is set,
659 	 * and will only break out if either another signal is received, or the
660 	 * counters are ready for read. To ensure the poll() doesn't sleep when
661 	 * done is set, use an eventfd (done_fd) to wake up the poll().
662 	 */
663 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
664 		pr_err("failed to signal wakeup fd, error: %m\n");
665 }
666 #endif // HAVE_EVENTFD_SUPPORT
667 }
668 
669 static void sigsegv_handler(int sig)
670 {
671 	perf_hooks__recover();
672 	sighandler_dump_stack(sig);
673 }
674 
675 static void record__sig_exit(void)
676 {
677 	if (signr == -1)
678 		return;
679 
680 	signal(signr, SIG_DFL);
681 	raise(signr);
682 }
683 
684 #ifdef HAVE_AUXTRACE_SUPPORT
685 
686 static int record__process_auxtrace(struct perf_tool *tool,
687 				    struct mmap *map,
688 				    union perf_event *event, void *data1,
689 				    size_t len1, void *data2, size_t len2)
690 {
691 	struct record *rec = container_of(tool, struct record, tool);
692 	struct perf_data *data = &rec->data;
693 	size_t padding;
694 	u8 pad[8] = {0};
695 
696 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
697 		off_t file_offset;
698 		int fd = perf_data__fd(data);
699 		int err;
700 
701 		file_offset = lseek(fd, 0, SEEK_CUR);
702 		if (file_offset == -1)
703 			return -1;
704 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
705 						     event, file_offset);
706 		if (err)
707 			return err;
708 	}
709 
710 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
711 	padding = (len1 + len2) & 7;
712 	if (padding)
713 		padding = 8 - padding;
714 
715 	record__write(rec, map, event, event->header.size);
716 	record__write(rec, map, data1, len1);
717 	if (len2)
718 		record__write(rec, map, data2, len2);
719 	record__write(rec, map, &pad, padding);
720 
721 	return 0;
722 }
723 
724 static int record__auxtrace_mmap_read(struct record *rec,
725 				      struct mmap *map)
726 {
727 	int ret;
728 
729 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
730 				  record__process_auxtrace);
731 	if (ret < 0)
732 		return ret;
733 
734 	if (ret)
735 		rec->samples++;
736 
737 	return 0;
738 }
739 
740 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
741 					       struct mmap *map)
742 {
743 	int ret;
744 
745 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
746 					   record__process_auxtrace,
747 					   rec->opts.auxtrace_snapshot_size);
748 	if (ret < 0)
749 		return ret;
750 
751 	if (ret)
752 		rec->samples++;
753 
754 	return 0;
755 }
756 
757 static int record__auxtrace_read_snapshot_all(struct record *rec)
758 {
759 	int i;
760 	int rc = 0;
761 
762 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
763 		struct mmap *map = &rec->evlist->mmap[i];
764 
765 		if (!map->auxtrace_mmap.base)
766 			continue;
767 
768 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
769 			rc = -1;
770 			goto out;
771 		}
772 	}
773 out:
774 	return rc;
775 }
776 
777 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
778 {
779 	pr_debug("Recording AUX area tracing snapshot\n");
780 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
781 		trigger_error(&auxtrace_snapshot_trigger);
782 	} else {
783 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
784 			trigger_error(&auxtrace_snapshot_trigger);
785 		else
786 			trigger_ready(&auxtrace_snapshot_trigger);
787 	}
788 }
789 
790 static int record__auxtrace_snapshot_exit(struct record *rec)
791 {
792 	if (trigger_is_error(&auxtrace_snapshot_trigger))
793 		return 0;
794 
795 	if (!auxtrace_record__snapshot_started &&
796 	    auxtrace_record__snapshot_start(rec->itr))
797 		return -1;
798 
799 	record__read_auxtrace_snapshot(rec, true);
800 	if (trigger_is_error(&auxtrace_snapshot_trigger))
801 		return -1;
802 
803 	return 0;
804 }
805 
806 static int record__auxtrace_init(struct record *rec)
807 {
808 	int err;
809 
810 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
811 	    && record__threads_enabled(rec)) {
812 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
813 		return -EINVAL;
814 	}
815 
816 	if (!rec->itr) {
817 		rec->itr = auxtrace_record__init(rec->evlist, &err);
818 		if (err)
819 			return err;
820 	}
821 
822 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
823 					      rec->opts.auxtrace_snapshot_opts);
824 	if (err)
825 		return err;
826 
827 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
828 					    rec->opts.auxtrace_sample_opts);
829 	if (err)
830 		return err;
831 
832 	auxtrace_regroup_aux_output(rec->evlist);
833 
834 	return auxtrace_parse_filters(rec->evlist);
835 }
836 
837 #else
838 
839 static inline
840 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
841 			       struct mmap *map __maybe_unused)
842 {
843 	return 0;
844 }
845 
846 static inline
847 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
848 				    bool on_exit __maybe_unused)
849 {
850 }
851 
852 static inline
853 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
854 {
855 	return 0;
856 }
857 
858 static inline
859 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
860 {
861 	return 0;
862 }
863 
864 static int record__auxtrace_init(struct record *rec __maybe_unused)
865 {
866 	return 0;
867 }
868 
869 #endif
870 
871 static int record__config_text_poke(struct evlist *evlist)
872 {
873 	struct evsel *evsel;
874 
875 	/* Nothing to do if text poke is already configured */
876 	evlist__for_each_entry(evlist, evsel) {
877 		if (evsel->core.attr.text_poke)
878 			return 0;
879 	}
880 
881 	evsel = evlist__add_dummy_on_all_cpus(evlist);
882 	if (!evsel)
883 		return -ENOMEM;
884 
885 	evsel->core.attr.text_poke = 1;
886 	evsel->core.attr.ksymbol = 1;
887 	evsel->immediate = true;
888 	evsel__set_sample_bit(evsel, TIME);
889 
890 	return 0;
891 }
892 
893 static int record__config_off_cpu(struct record *rec)
894 {
895 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
896 }
897 
898 static bool record__kcore_readable(struct machine *machine)
899 {
900 	char kcore[PATH_MAX];
901 	int fd;
902 
903 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
904 
905 	fd = open(kcore, O_RDONLY);
906 	if (fd < 0)
907 		return false;
908 
909 	close(fd);
910 
911 	return true;
912 }
913 
914 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
915 {
916 	char from_dir[PATH_MAX];
917 	char kcore_dir[PATH_MAX];
918 	int ret;
919 
920 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
921 
922 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
923 	if (ret)
924 		return ret;
925 
926 	return kcore_copy(from_dir, kcore_dir);
927 }
928 
929 static void record__thread_data_init_pipes(struct record_thread *thread_data)
930 {
931 	thread_data->pipes.msg[0] = -1;
932 	thread_data->pipes.msg[1] = -1;
933 	thread_data->pipes.ack[0] = -1;
934 	thread_data->pipes.ack[1] = -1;
935 }
936 
937 static int record__thread_data_open_pipes(struct record_thread *thread_data)
938 {
939 	if (pipe(thread_data->pipes.msg))
940 		return -EINVAL;
941 
942 	if (pipe(thread_data->pipes.ack)) {
943 		close(thread_data->pipes.msg[0]);
944 		thread_data->pipes.msg[0] = -1;
945 		close(thread_data->pipes.msg[1]);
946 		thread_data->pipes.msg[1] = -1;
947 		return -EINVAL;
948 	}
949 
950 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
951 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
952 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
953 
954 	return 0;
955 }
956 
957 static void record__thread_data_close_pipes(struct record_thread *thread_data)
958 {
959 	if (thread_data->pipes.msg[0] != -1) {
960 		close(thread_data->pipes.msg[0]);
961 		thread_data->pipes.msg[0] = -1;
962 	}
963 	if (thread_data->pipes.msg[1] != -1) {
964 		close(thread_data->pipes.msg[1]);
965 		thread_data->pipes.msg[1] = -1;
966 	}
967 	if (thread_data->pipes.ack[0] != -1) {
968 		close(thread_data->pipes.ack[0]);
969 		thread_data->pipes.ack[0] = -1;
970 	}
971 	if (thread_data->pipes.ack[1] != -1) {
972 		close(thread_data->pipes.ack[1]);
973 		thread_data->pipes.ack[1] = -1;
974 	}
975 }
976 
977 static bool evlist__per_thread(struct evlist *evlist)
978 {
979 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
980 }
981 
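/*
 * Distribute the evlist mmaps over this thread: all of them in
 * per-thread (dummy CPU map) mode, otherwise only those whose CPU is
 * set in the thread's 'maps' mask.
 */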
982 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
983 {
984 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
985 	struct mmap *mmap = evlist->mmap;
986 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
987 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
988 	bool per_thread = evlist__per_thread(evlist);
989 
990 	if (per_thread)
991 		thread_data->nr_mmaps = nr_mmaps;
992 	else
993 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
994 						      thread_data->mask->maps.nbits);
995 	if (mmap) {
996 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
997 		if (!thread_data->maps)
998 			return -ENOMEM;
999 	}
1000 	if (overwrite_mmap) {
1001 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1002 		if (!thread_data->overwrite_maps) {
1003 			zfree(&thread_data->maps);
1004 			return -ENOMEM;
1005 		}
1006 	}
1007 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1008 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1009 
1010 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1011 		if (per_thread ||
1012 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1013 			if (thread_data->maps) {
1014 				thread_data->maps[tm] = &mmap[m];
1015 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1016 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1017 			}
1018 			if (thread_data->overwrite_maps) {
1019 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1020 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1021 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1022 			}
1023 			tm++;
1024 		}
1025 	}
1026 
1027 	return 0;
1028 }
1029 
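/*
 * Build the thread's private pollfd array by duplicating the evlist
 * poll entries that belong to the mmaps served by this thread.
 */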
1030 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1031 {
1032 	int f, tm, pos;
1033 	struct mmap *map, *overwrite_map;
1034 
1035 	fdarray__init(&thread_data->pollfd, 64);
1036 
1037 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1038 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1039 		overwrite_map = thread_data->overwrite_maps ?
1040 				thread_data->overwrite_maps[tm] : NULL;
1041 
1042 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1043 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1044 
1045 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1046 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1047 							      &evlist->core.pollfd);
1048 				if (pos < 0)
1049 					return pos;
1050 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1051 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1052 			}
1053 		}
1054 	}
1055 
1056 	return 0;
1057 }
1058 
1059 static void record__free_thread_data(struct record *rec)
1060 {
1061 	int t;
1062 	struct record_thread *thread_data = rec->thread_data;
1063 
1064 	if (thread_data == NULL)
1065 		return;
1066 
1067 	for (t = 0; t < rec->nr_threads; t++) {
1068 		record__thread_data_close_pipes(&thread_data[t]);
1069 		zfree(&thread_data[t].maps);
1070 		zfree(&thread_data[t].overwrite_maps);
1071 		fdarray__exit(&thread_data[t].pollfd);
1072 	}
1073 
1074 	zfree(&rec->thread_data);
1075 }
1076 
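/*
 * Allocate and initialize one record_thread per thread mask. Entry 0 is
 * the main thread and reuses the evlist control fd; the other entries
 * get msg/ack pipes for communication with the main thread.
 */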
1077 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1078 {
1079 	int t, ret;
1080 	struct record_thread *thread_data;
1081 
1082 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1083 	if (!rec->thread_data) {
1084 		pr_err("Failed to allocate thread data\n");
1085 		return -ENOMEM;
1086 	}
1087 	thread_data = rec->thread_data;
1088 
1089 	for (t = 0; t < rec->nr_threads; t++)
1090 		record__thread_data_init_pipes(&thread_data[t]);
1091 
1092 	for (t = 0; t < rec->nr_threads; t++) {
1093 		thread_data[t].rec = rec;
1094 		thread_data[t].mask = &rec->thread_masks[t];
1095 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1096 		if (ret) {
1097 			pr_err("Failed to initialize thread[%d] maps\n", t);
1098 			goto out_free;
1099 		}
1100 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1101 		if (ret) {
1102 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1103 			goto out_free;
1104 		}
1105 		if (t) {
1106 			thread_data[t].tid = -1;
1107 			ret = record__thread_data_open_pipes(&thread_data[t]);
1108 			if (ret) {
1109 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1110 				goto out_free;
1111 			}
1112 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1113 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1114 			if (ret < 0) {
1115 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1116 				goto out_free;
1117 			}
1118 			thread_data[t].ctlfd_pos = ret;
1119 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1120 				 thread_data, thread_data[t].ctlfd_pos,
1121 				 thread_data[t].pipes.msg[0]);
1122 		} else {
1123 			thread_data[t].tid = gettid();
1124 			if (evlist->ctl_fd.pos == -1)
1125 				continue;
1126 			ret = fdarray__dup_entry_from(&thread_data[t].pollfd, evlist->ctl_fd.pos,
1127 						      &evlist->core.pollfd);
1128 			if (ret < 0) {
1129 				pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1130 				goto out_free;
1131 			}
1132 			thread_data[t].ctlfd_pos = ret;
1133 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1134 				 thread_data, thread_data[t].ctlfd_pos,
1135 				 evlist->core.pollfd.entries[evlist->ctl_fd.pos].fd);
1136 		}
1137 	}
1138 
1139 	return 0;
1140 
1141 out_free:
1142 	record__free_thread_data(rec);
1143 
1144 	return ret;
1145 }
1146 
1147 static int record__mmap_evlist(struct record *rec,
1148 			       struct evlist *evlist)
1149 {
1150 	int i, ret;
1151 	struct record_opts *opts = &rec->opts;
1152 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1153 				  opts->auxtrace_sample_mode;
1154 	char msg[512];
1155 
1156 	if (opts->affinity != PERF_AFFINITY_SYS)
1157 		cpu__setup_cpunode_map();
1158 
1159 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1160 				 opts->auxtrace_mmap_pages,
1161 				 auxtrace_overwrite,
1162 				 opts->nr_cblocks, opts->affinity,
1163 				 opts->mmap_flush, opts->comp_level) < 0) {
1164 		if (errno == EPERM) {
1165 			pr_err("Permission error mapping pages.\n"
1166 			       "Consider increasing "
1167 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1168 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1169 			       "(current value: %u,%u)\n",
1170 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1171 			return -errno;
1172 		} else {
1173 			pr_err("failed to mmap with %d (%s)\n", errno,
1174 				str_error_r(errno, msg, sizeof(msg)));
1175 			if (errno)
1176 				return -errno;
1177 			else
1178 				return -EINVAL;
1179 		}
1180 	}
1181 
1182 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1183 		return -1;
1184 
1185 	ret = record__alloc_thread_data(rec, evlist);
1186 	if (ret)
1187 		return ret;
1188 
1189 	if (record__threads_enabled(rec)) {
1190 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1191 		if (ret) {
1192 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1193 			return ret;
1194 		}
1195 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1196 			if (evlist->mmap)
1197 				evlist->mmap[i].file = &rec->data.dir.files[i];
1198 			if (evlist->overwrite_mmap)
1199 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1200 		}
1201 	}
1202 
1203 	return 0;
1204 }
1205 
1206 static int record__mmap(struct record *rec)
1207 {
1208 	return record__mmap_evlist(rec, rec->evlist);
1209 }
1210 
1211 static int record__open(struct record *rec)
1212 {
1213 	char msg[BUFSIZ];
1214 	struct evsel *pos;
1215 	struct evlist *evlist = rec->evlist;
1216 	struct perf_session *session = rec->session;
1217 	struct record_opts *opts = &rec->opts;
1218 	int rc = 0;
1219 
1220 	/*
1221 	 * For initial_delay, a system-wide target or a hybrid system, we need to
1222 	 * add a dummy event so that we can track PERF_RECORD_MMAP events to cover
1223 	 * the delay of waiting or of event synthesis.
1224 	 */
1225 	if (opts->initial_delay || target__has_cpu(&opts->target) ||
1226 	    perf_pmu__has_hybrid()) {
1227 		pos = evlist__get_tracking_event(evlist);
1228 		if (!evsel__is_dummy_event(pos)) {
1229 			/* Set up dummy event. */
1230 			if (evlist__add_dummy(evlist))
1231 				return -ENOMEM;
1232 			pos = evlist__last(evlist);
1233 			evlist__set_tracking_event(evlist, pos);
1234 		}
1235 
1236 		/*
1237 		 * Enable the dummy event on exec when the process is forked for
1238 		 * initial_delay, or immediately for a system-wide target.
1239 		 */
1240 		if (opts->initial_delay && !pos->immediate &&
1241 		    !target__has_cpu(&opts->target))
1242 			pos->core.attr.enable_on_exec = 1;
1243 		else
1244 			pos->immediate = 1;
1245 	}
1246 
1247 	evlist__config(evlist, opts, &callchain_param);
1248 
1249 	evlist__for_each_entry(evlist, pos) {
1250 try_again:
1251 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1252 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1253 				if (verbose > 0)
1254 					ui__warning("%s\n", msg);
1255 				goto try_again;
1256 			}
1257 			if ((errno == EINVAL || errno == EBADF) &&
1258 			    pos->core.leader != &pos->core &&
1259 			    pos->weak_group) {
1260 				pos = evlist__reset_weak_group(evlist, pos, true);
1261 				goto try_again;
1262 			}
1263 			rc = -errno;
1264 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1265 			ui__error("%s\n", msg);
1266 			goto out;
1267 		}
1268 
1269 		pos->supported = true;
1270 	}
1271 
1272 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1273 		pr_warning(
1274 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1275 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1276 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1277 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1278 "Samples in kernel modules won't be resolved at all.\n\n"
1279 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1280 "even with a suitable vmlinux or kallsyms file.\n\n");
1281 	}
1282 
1283 	if (evlist__apply_filters(evlist, &pos)) {
1284 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1285 			pos->filter, evsel__name(pos), errno,
1286 			str_error_r(errno, msg, sizeof(msg)));
1287 		rc = -1;
1288 		goto out;
1289 	}
1290 
1291 	rc = record__mmap(rec);
1292 	if (rc)
1293 		goto out;
1294 
1295 	session->evlist = evlist;
1296 	perf_session__set_id_hdr_size(session);
1297 out:
1298 	return rc;
1299 }
1300 
1301 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1302 {
1303 	if (rec->evlist->first_sample_time == 0)
1304 		rec->evlist->first_sample_time = sample_time;
1305 
1306 	if (sample_time)
1307 		rec->evlist->last_sample_time = sample_time;
1308 }
1309 
1310 static int process_sample_event(struct perf_tool *tool,
1311 				union perf_event *event,
1312 				struct perf_sample *sample,
1313 				struct evsel *evsel,
1314 				struct machine *machine)
1315 {
1316 	struct record *rec = container_of(tool, struct record, tool);
1317 
1318 	set_timestamp_boundary(rec, sample->time);
1319 
1320 	if (rec->buildid_all)
1321 		return 0;
1322 
1323 	rec->samples++;
1324 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1325 }
1326 
1327 static int process_buildids(struct record *rec)
1328 {
1329 	struct perf_session *session = rec->session;
1330 
1331 	if (perf_data__size(&rec->data) == 0)
1332 		return 0;
1333 
1334 	/*
1335 	 * During this process, it'll load the kernel map and replace the
1336 	 * dso->long_name with a real pathname it found.  In this case
1337 	 * we prefer a vmlinux path like
1338 	 *   /lib/modules/3.16.4/build/vmlinux
1339 	 *
1340 	 * rather than the build-id path (in the debug directory):
1341 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1342 	 */
1343 	symbol_conf.ignore_vmlinux_buildid = true;
1344 
1345 	/*
1346 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1347 	 * so there is no need to process samples. But if timestamp_boundary is
1348 	 * enabled, it still needs to walk all samples to get the timestamps of
1349 	 * the first/last samples.
1350 	 */
1351 	if (rec->buildid_all && !rec->timestamp_boundary)
1352 		rec->tool.sample = NULL;
1353 
1354 	return perf_session__process_events(session);
1355 }
1356 
1357 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1358 {
1359 	int err;
1360 	struct perf_tool *tool = data;
1361 	/*
1362 	 * As for the guest kernel, when processing the record & report
1363 	 * subcommands, we arrange the module mmaps prior to the guest kernel
1364 	 * mmap and trigger a dso preload, because by default guest module
1365 	 * symbols are loaded from guest kallsyms instead of
1366 	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
1367 	 * first address is in a module instead of in the guest kernel.
1368 	 */
1369 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1370 					     machine);
1371 	if (err < 0)
1372 		pr_err("Couldn't record guest kernel [%d]'s reference"
1373 		       " relocation symbol.\n", machine->pid);
1374 
1375 	/*
1376 	 * We use _stext for the guest kernel because the guest kernel's
1377 	 * /proc/kallsyms sometimes has no _text.
1378 	 */
1379 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1380 						 machine);
1381 	if (err < 0)
1382 		pr_err("Couldn't record guest kernel [%d]'s reference"
1383 		       " relocation symbol.\n", machine->pid);
1384 }
1385 
1386 static struct perf_event_header finished_round_event = {
1387 	.size = sizeof(struct perf_event_header),
1388 	.type = PERF_RECORD_FINISHED_ROUND,
1389 };
1390 
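/*
 * With --affinity=node|cpu, migrate the reading thread to the CPUs
 * backing the mmap before touching its data, keeping the accesses
 * local to the buffer's NUMA node.
 */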
1391 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1392 {
1393 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1394 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1395 			  thread->mask->affinity.nbits)) {
1396 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1397 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1398 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1399 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1400 					(cpu_set_t *)thread->mask->affinity.bits);
1401 		if (verbose == 2) {
1402 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1403 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1404 		}
1405 	}
1406 }
1407 
1408 static size_t process_comp_header(void *record, size_t increment)
1409 {
1410 	struct perf_record_compressed *event = record;
1411 	size_t size = sizeof(*event);
1412 
1413 	if (increment) {
1414 		event->header.size += increment;
1415 		return increment;
1416 	}
1417 
1418 	event->header.type = PERF_RECORD_COMPRESSED;
1419 	event->header.size = size;
1420 
1421 	return size;
1422 }
1423 
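/*
 * Compress src into dst as PERF_RECORD_COMPRESSED records. In threaded
 * mode (map->file set) the per-mmap zstd stream and per-thread byte
 * counters are used, otherwise the session-wide ones.
 */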
1424 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1425 			    void *dst, size_t dst_size, void *src, size_t src_size)
1426 {
1427 	size_t compressed;
1428 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1429 	struct zstd_data *zstd_data = &session->zstd_data;
1430 
1431 	if (map && map->file)
1432 		zstd_data = &map->zstd_data;
1433 
1434 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1435 						     max_record_size, process_comp_header);
1436 
1437 	if (map && map->file) {
1438 		thread->bytes_transferred += src_size;
1439 		thread->bytes_compressed  += compressed;
1440 	} else {
1441 		session->bytes_transferred += src_size;
1442 		session->bytes_compressed  += compressed;
1443 	}
1444 
1445 	return compressed;
1446 }
1447 
1448 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1449 				    bool overwrite, bool synch)
1450 {
1451 	u64 bytes_written = rec->bytes_written;
1452 	int i;
1453 	int rc = 0;
1454 	int nr_mmaps;
1455 	struct mmap **maps;
1456 	int trace_fd = rec->data.file.fd;
1457 	off_t off = 0;
1458 
1459 	if (!evlist)
1460 		return 0;
1461 
1462 	nr_mmaps = thread->nr_mmaps;
1463 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1464 
1465 	if (!maps)
1466 		return 0;
1467 
1468 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1469 		return 0;
1470 
1471 	if (record__aio_enabled(rec))
1472 		off = record__aio_get_pos(trace_fd);
1473 
1474 	for (i = 0; i < nr_mmaps; i++) {
1475 		u64 flush = 0;
1476 		struct mmap *map = maps[i];
1477 
1478 		if (map->core.base) {
1479 			record__adjust_affinity(rec, map);
1480 			if (synch) {
1481 				flush = map->core.flush;
1482 				map->core.flush = 1;
1483 			}
1484 			if (!record__aio_enabled(rec)) {
1485 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1486 					if (synch)
1487 						map->core.flush = flush;
1488 					rc = -1;
1489 					goto out;
1490 				}
1491 			} else {
1492 				if (record__aio_push(rec, map, &off) < 0) {
1493 					record__aio_set_pos(trace_fd, off);
1494 					if (synch)
1495 						map->core.flush = flush;
1496 					rc = -1;
1497 					goto out;
1498 				}
1499 			}
1500 			if (synch)
1501 				map->core.flush = flush;
1502 		}
1503 
1504 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1505 		    !rec->opts.auxtrace_sample_mode &&
1506 		    record__auxtrace_mmap_read(rec, map) != 0) {
1507 			rc = -1;
1508 			goto out;
1509 		}
1510 	}
1511 
1512 	if (record__aio_enabled(rec))
1513 		record__aio_set_pos(trace_fd, off);
1514 
1515 	/*
1516 	 * Mark the round finished in case we wrote
1517 	 * at least one event.
1518 	 *
1519 	 * No need for round events in directory mode,
1520 	 * because the per-cpu maps and files have their
1521 	 * data sorted by the kernel.
1522 	 */
1523 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1524 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1525 
1526 	if (overwrite)
1527 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1528 out:
1529 	return rc;
1530 }
1531 
1532 static int record__mmap_read_all(struct record *rec, bool synch)
1533 {
1534 	int err;
1535 
1536 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1537 	if (err)
1538 		return err;
1539 
1540 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1541 }
1542 
1543 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1544 					   void *arg __maybe_unused)
1545 {
1546 	struct perf_mmap *map = fda->priv[fd].ptr;
1547 
1548 	if (map)
1549 		perf_mmap__put(map);
1550 }
1551 
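/*
 * Body of a recording thread: report readiness over the ack pipe, then
 * keep draining the assigned mmaps, polling when no data arrived, until
 * the msg pipe is closed (POLLHUP) by the main thread, and finally do a
 * synchronizing read of whatever is left.
 */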
1552 static void *record__thread(void *arg)
1553 {
1554 	enum thread_msg msg = THREAD_MSG__READY;
1555 	bool terminate = false;
1556 	struct fdarray *pollfd;
1557 	int err, ctlfd_pos;
1558 
1559 	thread = arg;
1560 	thread->tid = gettid();
1561 
1562 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1563 	if (err == -1)
1564 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1565 			   thread->tid, strerror(errno));
1566 
1567 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1568 
1569 	pollfd = &thread->pollfd;
1570 	ctlfd_pos = thread->ctlfd_pos;
1571 
1572 	for (;;) {
1573 		unsigned long long hits = thread->samples;
1574 
1575 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1576 			break;
1577 
1578 		if (hits == thread->samples) {
1579 
1580 			err = fdarray__poll(pollfd, -1);
1581 			/*
1582 			 * Propagate the error only if there is one. Ignore a positive
1583 			 * number of returned events and interrupt errors.
1584 			 */
1585 			if (err > 0 || (err < 0 && errno == EINTR))
1586 				err = 0;
1587 			thread->waking++;
1588 
1589 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1590 					    record__thread_munmap_filtered, NULL) == 0)
1591 				break;
1592 		}
1593 
1594 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1595 			terminate = true;
1596 			close(thread->pipes.msg[0]);
1597 			thread->pipes.msg[0] = -1;
1598 			pollfd->entries[ctlfd_pos].fd = -1;
1599 			pollfd->entries[ctlfd_pos].events = 0;
1600 		}
1601 
1602 		pollfd->entries[ctlfd_pos].revents = 0;
1603 	}
1604 	record__mmap_read_all(thread->rec, true);
1605 
1606 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1607 	if (err == -1)
1608 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1609 			   thread->tid, strerror(errno));
1610 
1611 	return NULL;
1612 }
1613 
1614 static void record__init_features(struct record *rec)
1615 {
1616 	struct perf_session *session = rec->session;
1617 	int feat;
1618 
1619 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1620 		perf_header__set_feat(&session->header, feat);
1621 
1622 	if (rec->no_buildid)
1623 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1624 
1625 	if (!have_tracepoints(&rec->evlist->core.entries))
1626 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1627 
1628 	if (!rec->opts.branch_stack)
1629 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1630 
1631 	if (!rec->opts.full_auxtrace)
1632 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1633 
1634 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1635 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1636 
1637 	if (!rec->opts.use_clockid)
1638 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1639 
1640 	if (!record__threads_enabled(rec))
1641 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1642 
1643 	if (!record__comp_enabled(rec))
1644 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1645 
1646 	perf_header__clear_feat(&session->header, HEADER_STAT);
1647 }
1648 
1649 static void
1650 record__finish_output(struct record *rec)
1651 {
1652 	int i;
1653 	struct perf_data *data = &rec->data;
1654 	int fd = perf_data__fd(data);
1655 
1656 	if (data->is_pipe)
1657 		return;
1658 
1659 	rec->session->header.data_size += rec->bytes_written;
1660 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1661 	if (record__threads_enabled(rec)) {
1662 		for (i = 0; i < data->dir.nr; i++)
1663 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1664 	}
1665 
1666 	if (!rec->no_buildid) {
1667 		process_buildids(rec);
1668 
1669 		if (rec->buildid_all)
1670 			dsos__hit_all(rec->session);
1671 	}
1672 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1673 
1674 	return;
1675 }
1676 
1677 static int record__synthesize_workload(struct record *rec, bool tail)
1678 {
1679 	int err;
1680 	struct perf_thread_map *thread_map;
1681 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1682 
1683 	if (rec->opts.tail_synthesize != tail)
1684 		return 0;
1685 
1686 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1687 	if (thread_map == NULL)
1688 		return -1;
1689 
1690 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1691 						 process_synthesized_event,
1692 						 &rec->session->machines.host,
1693 						 needs_mmap,
1694 						 rec->opts.sample_address);
1695 	perf_thread_map__put(thread_map);
1696 	return err;
1697 }
1698 
1699 static int record__synthesize(struct record *rec, bool tail);
1700 
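/*
 * Finalize the current output file and switch to a new, timestamped one,
 * resynthesizing the tracking events the new file needs. Returns the new
 * output fd or a negative error.
 */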
1701 static int
1702 record__switch_output(struct record *rec, bool at_exit)
1703 {
1704 	struct perf_data *data = &rec->data;
1705 	int fd, err;
1706 	char *new_filename;
1707 
1708 	/* Same size as a real timestamp, e.g. "2015122520103046" */
1709 	char timestamp[] = "InvalidTimestamp";
1710 
1711 	record__aio_mmap_read_sync(rec);
1712 
1713 	record__synthesize(rec, true);
1714 	if (target__none(&rec->opts.target))
1715 		record__synthesize_workload(rec, true);
1716 
1717 	rec->samples = 0;
1718 	record__finish_output(rec);
1719 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1720 	if (err) {
1721 		pr_err("Failed to get current timestamp\n");
1722 		return -EINVAL;
1723 	}
1724 
1725 	fd = perf_data__switch(data, timestamp,
1726 				    rec->session->header.data_offset,
1727 				    at_exit, &new_filename);
1728 	if (fd >= 0 && !at_exit) {
1729 		rec->bytes_written = 0;
1730 		rec->session->header.data_size = 0;
1731 	}
1732 
1733 	if (!quiet)
1734 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1735 			data->path, timestamp);
1736 
1737 	if (rec->switch_output.num_files) {
1738 		int n = rec->switch_output.cur_file + 1;
1739 
1740 		if (n >= rec->switch_output.num_files)
1741 			n = 0;
1742 		rec->switch_output.cur_file = n;
1743 		if (rec->switch_output.filenames[n]) {
1744 			remove(rec->switch_output.filenames[n]);
1745 			zfree(&rec->switch_output.filenames[n]);
1746 		}
1747 		rec->switch_output.filenames[n] = new_filename;
1748 	} else {
1749 		free(new_filename);
1750 	}
1751 
1752 	/* Output tracking events */
1753 	if (!at_exit) {
1754 		record__synthesize(rec, false);
1755 
1756 		/*
1757 		 * In 'perf record --switch-output' without -a,
1758 		 * record__synthesize() in record__switch_output() won't
1759 		 * generate tracking events because there's no thread_map
1760 		 * in the evlist, which causes the newly created perf.data
1761 		 * to lack mmap and comm information.
1762 		 * Create a fake thread_map and directly call
1763 		 * perf_event__synthesize_thread_map() for those events.
1764 		 */
1765 		if (target__none(&rec->opts.target))
1766 			record__synthesize_workload(rec, false);
1767 	}
1768 	return fd;
1769 }
1770 
1771 static volatile int workload_exec_errno;
1772 
1773 /*
1774  * evlist__prepare_workload() will send a SIGUSR1
1775  * if the fork fails, since we asked for it by setting its
1776  * want_signal to true.
1777  */
1778 static void workload_exec_failed_signal(int signo __maybe_unused,
1779 					siginfo_t *info,
1780 					void *ucontext __maybe_unused)
1781 {
1782 	workload_exec_errno = info->si_value.sival_int;
1783 	done = 1;
1784 	child_finished = 1;
1785 }
1786 
1787 static void snapshot_sig_handler(int sig);
1788 static void alarm_sig_handler(int sig);
1789 
1790 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1791 {
1792 	if (evlist) {
1793 		if (evlist->mmap && evlist->mmap[0].core.base)
1794 			return evlist->mmap[0].core.base;
1795 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1796 			return evlist->overwrite_mmap[0].core.base;
1797 	}
1798 	return NULL;
1799 }
1800 
1801 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1802 {
1803 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1804 	if (pc)
1805 		return pc;
1806 	return NULL;
1807 }
1808 
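/*
 * Synthesize the non-sample metadata (time conversion, id index,
 * auxtrace info, kernel and module mmaps, thread and cpu maps, BPF and
 * cgroup events, existing threads) either at the start of the record
 * or, with --tail-synthesize, right before the data is finalized.
 */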
1809 static int record__synthesize(struct record *rec, bool tail)
1810 {
1811 	struct perf_session *session = rec->session;
1812 	struct machine *machine = &session->machines.host;
1813 	struct perf_data *data = &rec->data;
1814 	struct record_opts *opts = &rec->opts;
1815 	struct perf_tool *tool = &rec->tool;
1816 	int err = 0;
1817 	event_op f = process_synthesized_event;
1818 
1819 	if (rec->opts.tail_synthesize != tail)
1820 		return 0;
1821 
1822 	if (data->is_pipe) {
1823 		err = perf_event__synthesize_for_pipe(tool, session, data,
1824 						      process_synthesized_event);
1825 		if (err < 0)
1826 			goto out;
1827 
1828 		rec->bytes_written += err;
1829 	}
1830 
1831 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1832 					  process_synthesized_event, machine);
1833 	if (err)
1834 		goto out;
1835 
1836 	/* Synthesize id_index before auxtrace_info */
1837 	if (rec->opts.auxtrace_sample_mode || rec->opts.full_auxtrace) {
1838 		err = perf_event__synthesize_id_index(tool,
1839 						      process_synthesized_event,
1840 						      session->evlist, machine);
1841 		if (err)
1842 			goto out;
1843 	}
1844 
1845 	if (rec->opts.full_auxtrace) {
1846 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1847 					session, process_synthesized_event);
1848 		if (err)
1849 			goto out;
1850 	}
1851 
1852 	if (!evlist__exclude_kernel(rec->evlist)) {
1853 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1854 							 machine);
1855 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1856 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1857 				   "Check /proc/kallsyms permission or run as root.\n");
1858 
1859 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1860 						     machine);
1861 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1862 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1863 				   "Check /proc/modules permission or run as root.\n");
1864 	}
1865 
1866 	if (perf_guest) {
1867 		machines__process_guests(&session->machines,
1868 					 perf_event__synthesize_guest_os, tool);
1869 	}
1870 
1871 	err = perf_event__synthesize_extra_attr(&rec->tool,
1872 						rec->evlist,
1873 						process_synthesized_event,
1874 						data->is_pipe);
1875 	if (err)
1876 		goto out;
1877 
1878 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1879 						 process_synthesized_event,
1880 						NULL);
1881 	if (err < 0) {
1882 		pr_err("Couldn't synthesize thread map.\n");
1883 		return err;
1884 	}
1885 
1886 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
1887 					     process_synthesized_event, NULL);
1888 	if (err < 0) {
1889 		pr_err("Couldn't synthesize cpu map.\n");
1890 		return err;
1891 	}
1892 
1893 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1894 						machine, opts);
1895 	if (err < 0)
1896 		pr_warning("Couldn't synthesize bpf events.\n");
1897 
1898 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
1899 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1900 						     machine);
1901 		if (err < 0)
1902 			pr_warning("Couldn't synthesize cgroup events.\n");
1903 	}
1904 
1905 	if (rec->opts.nr_threads_synthesize > 1) {
1906 		perf_set_multithreaded();
1907 		f = process_locked_synthesized_event;
1908 	}
1909 
1910 	if (rec->opts.synth & PERF_SYNTH_TASK) {
1911 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1912 
1913 		err = __machine__synthesize_threads(machine, tool, &opts->target,
1914 						    rec->evlist->core.threads,
1915 						    f, needs_mmap, opts->sample_address,
1916 						    rec->opts.nr_threads_synthesize);
1917 	}
1918 
1919 	if (rec->opts.nr_threads_synthesize > 1)
1920 		perf_set_singlethreaded();
1921 
1922 out:
1923 	return err;
1924 }
1925 
1926 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1927 {
1928 	struct record *rec = data;
1929 	pthread_kill(rec->thread_id, SIGUSR2);
1930 	return 0;
1931 }
1932 
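/*
 * Set up the side band event list: wire up the SIGUSR2 callback when
 * --switch-output-event populated it, optionally add the BPF side band
 * event used to track PERF_RECORD_BPF_EVENT, and start the side band
 * thread.
 */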
1933 static int record__setup_sb_evlist(struct record *rec)
1934 {
1935 	struct record_opts *opts = &rec->opts;
1936 
1937 	if (rec->sb_evlist != NULL) {
1938 		/*
1939 		 * We get here if --switch-output-event populated the
1940 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1941 		 * to the main thread.
1942 		 */
1943 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1944 		rec->thread_id = pthread_self();
1945 	}
1946 #ifdef HAVE_LIBBPF_SUPPORT
1947 	if (!opts->no_bpf_event) {
1948 		if (rec->sb_evlist == NULL) {
1949 			rec->sb_evlist = evlist__new();
1950 
1951 			if (rec->sb_evlist == NULL) {
1952 				pr_err("Couldn't create side band evlist.\n.");
1953 				return -1;
1954 			}
1955 		}
1956 
1957 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1958 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
1959 			return -1;
1960 		}
1961 	}
1962 #endif
1963 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1964 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1965 		opts->no_bpf_event = true;
1966 	}
1967 
1968 	return 0;
1969 }
1970 
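/*
 * When -k/--clockid is used, store the clockid plus a pair of reference
 * timestamps (TOD and the selected clock) in the session header so that
 * reporting tools can convert perf timestamps to wall-clock time.
 */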
1971 static int record__init_clock(struct record *rec)
1972 {
1973 	struct perf_session *session = rec->session;
1974 	struct timespec ref_clockid;
1975 	struct timeval ref_tod;
1976 	u64 ref;
1977 
1978 	if (!rec->opts.use_clockid)
1979 		return 0;
1980 
	if (rec->opts.clockid_res_ns)
1982 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1983 
1984 	session->header.env.clock.clockid = rec->opts.clockid;
1985 
1986 	if (gettimeofday(&ref_tod, NULL) != 0) {
1987 		pr_err("gettimeofday failed, cannot set reference time.\n");
1988 		return -1;
1989 	}
1990 
1991 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1992 		pr_err("clock_gettime failed, cannot set reference time.\n");
1993 		return -1;
1994 	}
1995 
1996 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1997 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1998 
1999 	session->header.env.clock.tod_ns = ref;
2000 
2001 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2002 	      (u64) ref_clockid.tv_nsec;
2003 
2004 	session->header.env.clock.clockid_ns = ref;
2005 	return 0;
2006 }
2007 
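/*
 * Hit the AUX area snapshot trigger, if it is armed, and kick off an
 * auxtrace snapshot; mark the trigger as errored if the snapshot cannot
 * be started.
 */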
2008 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2009 {
2010 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2011 		trigger_hit(&auxtrace_snapshot_trigger);
2012 		auxtrace_record__snapshot_started = 1;
2013 		if (auxtrace_record__snapshot_start(rec->itr))
2014 			trigger_error(&auxtrace_snapshot_trigger);
2015 	}
2016 }
2017 
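/*
 * On hybrid systems, rewrite plain event names as "pmu/event/" so that
 * identically named events coming from different hybrid PMUs can be
 * told apart in the output.
 */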
2018 static void record__uniquify_name(struct record *rec)
2019 {
2020 	struct evsel *pos;
2021 	struct evlist *evlist = rec->evlist;
2022 	char *new_name;
2023 	int ret;
2024 
2025 	if (!perf_pmu__has_hybrid())
2026 		return;
2027 
2028 	evlist__for_each_entry(evlist, pos) {
2029 		if (!evsel__is_hybrid(pos))
2030 			continue;
2031 
2032 		if (strchr(pos->name, '/'))
2033 			continue;
2034 
		ret = asprintf(&new_name, "%s/%s/",
			       pos->pmu_name, pos->name);
		/* asprintf() returns -1 on failure, so only swap the name on success. */
		if (ret >= 0) {
			free(pos->name);
			pos->name = new_name;
		}
2041 	}
2042 }
2043 
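/*
 * Ask a reader thread to stop by closing the write end of its message
 * pipe, then wait for the acknowledgement on its ack pipe.
 */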
2044 static int record__terminate_thread(struct record_thread *thread_data)
2045 {
2046 	int err;
2047 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2048 	pid_t tid = thread_data->tid;
2049 
2050 	close(thread_data->pipes.msg[1]);
2051 	thread_data->pipes.msg[1] = -1;
2052 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2053 	if (err > 0)
2054 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2055 	else
2056 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2057 			   thread->tid, tid);
2058 
2059 	return 0;
2060 }
2061 
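/*
 * Start the additional reader threads (thread_data[1..]) with all
 * signals blocked and, where supported, affinity matching their mmaps,
 * waiting for each to report readiness.  thread_data[0] is serviced by
 * the main thread, which is pinned to its own affinity mask here.
 */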
2062 static int record__start_threads(struct record *rec)
2063 {
2064 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2065 	struct record_thread *thread_data = rec->thread_data;
2066 	sigset_t full, mask;
2067 	pthread_t handle;
2068 	pthread_attr_t attrs;
2069 
2070 	thread = &thread_data[0];
2071 
2072 	if (!record__threads_enabled(rec))
2073 		return 0;
2074 
2075 	sigfillset(&full);
2076 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2077 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2078 		return -1;
2079 	}
2080 
2081 	pthread_attr_init(&attrs);
2082 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2083 
2084 	for (t = 1; t < nr_threads; t++) {
2085 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2086 
2087 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2088 		pthread_attr_setaffinity_np(&attrs,
2089 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2090 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2091 #endif
2092 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2093 			for (tt = 1; tt < t; tt++)
				record__terminate_thread(&thread_data[tt]);
2095 			pr_err("Failed to start threads: %s\n", strerror(errno));
2096 			ret = -1;
2097 			goto out_err;
2098 		}
2099 
2100 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2101 		if (err > 0)
2102 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2103 				  thread_msg_tags[msg]);
2104 		else
2105 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2106 				   thread->tid, rec->thread_data[t].tid);
2107 	}
2108 
2109 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2110 			(cpu_set_t *)thread->mask->affinity.bits);
2111 
2112 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2113 
2114 out_err:
2115 	pthread_attr_destroy(&attrs);
2116 
2117 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2118 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2119 		ret = -1;
2120 	}
2121 
2122 	return ret;
2123 }
2124 
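/*
 * Terminate all reader threads, sum their sample counts into the record
 * totals and, in parallel streaming mode, fold their transferred and
 * compressed byte counts into the session totals.
 */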
2125 static int record__stop_threads(struct record *rec)
2126 {
2127 	int t;
2128 	struct record_thread *thread_data = rec->thread_data;
2129 
2130 	for (t = 1; t < rec->nr_threads; t++)
2131 		record__terminate_thread(&thread_data[t]);
2132 
2133 	for (t = 0; t < rec->nr_threads; t++) {
2134 		rec->samples += thread_data[t].samples;
2135 		if (!record__threads_enabled(rec))
2136 			continue;
2137 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2138 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2139 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2140 			 thread_data[t].samples, thread_data[t].waking);
2141 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2142 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2143 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2144 		else
2145 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2146 	}
2147 
2148 	return 0;
2149 }
2150 
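/* Total number of poll wakeups across all reader threads. */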
2151 static unsigned long record__waking(struct record *rec)
2152 {
2153 	int t;
2154 	unsigned long waking = 0;
2155 	struct record_thread *thread_data = rec->thread_data;
2156 
2157 	for (t = 0; t < rec->nr_threads; t++)
2158 		waking += thread_data[t].waking;
2159 
2160 	return waking;
2161 }
2162 
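/*
 * Top level of 'perf record': create the session, set up the output,
 * start the workload and the reader threads, run the poll/read loop
 * until done and finalize the perf.data file.
 */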
2163 static int __cmd_record(struct record *rec, int argc, const char **argv)
2164 {
2165 	int err;
2166 	int status = 0;
2167 	const bool forks = argc > 0;
2168 	struct perf_tool *tool = &rec->tool;
2169 	struct record_opts *opts = &rec->opts;
2170 	struct perf_data *data = &rec->data;
2171 	struct perf_session *session;
2172 	bool disabled = false, draining = false;
2173 	int fd;
2174 	float ratio = 0;
2175 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2176 
2177 	atexit(record__sig_exit);
2178 	signal(SIGCHLD, sig_handler);
2179 	signal(SIGINT, sig_handler);
2180 	signal(SIGTERM, sig_handler);
2181 	signal(SIGSEGV, sigsegv_handler);
2182 
2183 	if (rec->opts.record_namespaces)
2184 		tool->namespace_events = true;
2185 
2186 	if (rec->opts.record_cgroup) {
2187 #ifdef HAVE_FILE_HANDLE
2188 		tool->cgroup_events = true;
2189 #else
2190 		pr_err("cgroup tracking is not supported\n");
2191 		return -1;
2192 #endif
2193 	}
2194 
2195 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2196 		signal(SIGUSR2, snapshot_sig_handler);
2197 		if (rec->opts.auxtrace_snapshot_mode)
2198 			trigger_on(&auxtrace_snapshot_trigger);
2199 		if (rec->switch_output.enabled)
2200 			trigger_on(&switch_output_trigger);
2201 	} else {
2202 		signal(SIGUSR2, SIG_IGN);
2203 	}
2204 
2205 	session = perf_session__new(data, tool);
2206 	if (IS_ERR(session)) {
2207 		pr_err("Perf session creation failed.\n");
2208 		return PTR_ERR(session);
2209 	}
2210 
2211 	if (record__threads_enabled(rec)) {
2212 		if (perf_data__is_pipe(&rec->data)) {
2213 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2214 			return -1;
2215 		}
2216 		if (rec->opts.full_auxtrace) {
2217 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2218 			return -1;
2219 		}
2220 	}
2221 
2222 	fd = perf_data__fd(data);
2223 	rec->session = session;
2224 
2225 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2226 		pr_err("Compression initialization failed.\n");
2227 		return -1;
2228 	}
2229 #ifdef HAVE_EVENTFD_SUPPORT
2230 	done_fd = eventfd(0, EFD_NONBLOCK);
2231 	if (done_fd < 0) {
2232 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2233 		status = -1;
2234 		goto out_delete_session;
2235 	}
2236 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2237 	if (err < 0) {
2238 		pr_err("Failed to add wakeup eventfd to poll list\n");
2239 		status = err;
2240 		goto out_delete_session;
2241 	}
2242 #endif // HAVE_EVENTFD_SUPPORT
2243 
2244 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2245 	session->header.env.comp_level = rec->opts.comp_level;
2246 
2247 	if (rec->opts.kcore &&
2248 	    !record__kcore_readable(&session->machines.host)) {
2249 		pr_err("ERROR: kcore is not readable.\n");
2250 		return -1;
2251 	}
2252 
2253 	if (record__init_clock(rec))
2254 		return -1;
2255 
2256 	record__init_features(rec);
2257 
2258 	if (forks) {
2259 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2260 					       workload_exec_failed_signal);
2261 		if (err < 0) {
2262 			pr_err("Couldn't run the workload!\n");
2263 			status = err;
2264 			goto out_delete_session;
2265 		}
2266 	}
2267 
2268 	/*
	 * If we have just a single event and are sending data through a
	 * pipe, we need to force the id allocation, because we synthesize
	 * the event name through the pipe and need the id for that.
2273 	 */
2274 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2275 		rec->opts.sample_id = true;
2276 
2277 	record__uniquify_name(rec);
2278 
2279 	if (record__open(rec) != 0) {
2280 		err = -1;
2281 		goto out_free_threads;
2282 	}
2283 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2284 
2285 	if (rec->opts.kcore) {
2286 		err = record__kcore_copy(&session->machines.host, data);
2287 		if (err) {
2288 			pr_err("ERROR: Failed to copy kcore\n");
2289 			goto out_free_threads;
2290 		}
2291 	}
2292 
2293 	err = bpf__apply_obj_config();
2294 	if (err) {
2295 		char errbuf[BUFSIZ];
2296 
2297 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2298 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2299 			 errbuf);
2300 		goto out_free_threads;
2301 	}
2302 
2303 	/*
2304 	 * Normally perf_session__new would do this, but it doesn't have the
2305 	 * evlist.
2306 	 */
2307 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2308 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2309 		rec->tool.ordered_events = false;
2310 	}
2311 
2312 	if (!rec->evlist->core.nr_groups)
2313 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2314 
2315 	if (data->is_pipe) {
2316 		err = perf_header__write_pipe(fd);
2317 		if (err < 0)
2318 			goto out_free_threads;
2319 	} else {
2320 		err = perf_session__write_header(session, rec->evlist, fd, false);
2321 		if (err < 0)
2322 			goto out_free_threads;
2323 	}
2324 
2325 	err = -1;
2326 	if (!rec->no_buildid
2327 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2328 		pr_err("Couldn't generate buildids. "
2329 		       "Use --no-buildid to profile anyway.\n");
2330 		goto out_free_threads;
2331 	}
2332 
2333 	err = record__setup_sb_evlist(rec);
2334 	if (err)
2335 		goto out_free_threads;
2336 
2337 	err = record__synthesize(rec, false);
2338 	if (err < 0)
2339 		goto out_free_threads;
2340 
2341 	if (rec->realtime_prio) {
2342 		struct sched_param param;
2343 
2344 		param.sched_priority = rec->realtime_prio;
2345 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2346 			pr_err("Could not set realtime priority.\n");
2347 			err = -1;
2348 			goto out_free_threads;
2349 		}
2350 	}
2351 
2352 	if (record__start_threads(rec))
2353 		goto out_free_threads;
2354 
2355 	/*
2356 	 * When perf is starting the traced process, all the events
2357 	 * (apart from group members) have enable_on_exec=1 set,
2358 	 * so don't spoil it by prematurely enabling them.
2359 	 */
2360 	if (!target__none(&opts->target) && !opts->initial_delay)
2361 		evlist__enable(rec->evlist);
2362 
2363 	/*
2364 	 * Let the child rip
2365 	 */
2366 	if (forks) {
2367 		struct machine *machine = &session->machines.host;
2368 		union perf_event *event;
2369 		pid_t tgid;
2370 
2371 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2372 		if (event == NULL) {
2373 			err = -ENOMEM;
2374 			goto out_child;
2375 		}
2376 
2377 		/*
		 * Some H/W events are generated before the COMM event,
		 * which is emitted during exec(), so perf script cannot
		 * see a correct process name for those events.
		 * Synthesize a COMM event up front to prevent that.
2382 		 */
2383 		tgid = perf_event__synthesize_comm(tool, event,
2384 						   rec->evlist->workload.pid,
2385 						   process_synthesized_event,
2386 						   machine);
2387 		free(event);
2388 
2389 		if (tgid == -1)
2390 			goto out_child;
2391 
2392 		event = malloc(sizeof(event->namespaces) +
2393 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2394 			       machine->id_hdr_size);
2395 		if (event == NULL) {
2396 			err = -ENOMEM;
2397 			goto out_child;
2398 		}
2399 
2400 		/*
2401 		 * Synthesize NAMESPACES event for the command specified.
2402 		 */
2403 		perf_event__synthesize_namespaces(tool, event,
2404 						  rec->evlist->workload.pid,
2405 						  tgid, process_synthesized_event,
2406 						  machine);
2407 		free(event);
2408 
2409 		evlist__start_workload(rec->evlist);
2410 	}
2411 
2412 	if (opts->initial_delay) {
2413 		pr_info(EVLIST_DISABLED_MSG);
2414 		if (opts->initial_delay > 0) {
2415 			usleep(opts->initial_delay * USEC_PER_MSEC);
2416 			evlist__enable(rec->evlist);
2417 			pr_info(EVLIST_ENABLED_MSG);
2418 		}
2419 	}
2420 
2421 	trigger_ready(&auxtrace_snapshot_trigger);
2422 	trigger_ready(&switch_output_trigger);
2423 	perf_hooks__invoke_record_start();
2424 	for (;;) {
2425 		unsigned long long hits = thread->samples;
2426 
2427 		/*
		 * rec->evlist->bkw_mmap_state may be BKW_MMAP_EMPTY here:
		 * when done == true and hits != rec->samples in the
		 * previous round.
		 *
		 * evlist__toggle_bkw_mmap() ensures we never convert
		 * BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2434 		 */
2435 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2436 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2437 
2438 		if (record__mmap_read_all(rec, false) < 0) {
2439 			trigger_error(&auxtrace_snapshot_trigger);
2440 			trigger_error(&switch_output_trigger);
2441 			err = -1;
2442 			goto out_child;
2443 		}
2444 
2445 		if (auxtrace_record__snapshot_started) {
2446 			auxtrace_record__snapshot_started = 0;
2447 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2448 				record__read_auxtrace_snapshot(rec, false);
2449 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2450 				pr_err("AUX area tracing snapshot failed\n");
2451 				err = -1;
2452 				goto out_child;
2453 			}
2454 		}
2455 
2456 		if (trigger_is_hit(&switch_output_trigger)) {
2457 			/*
			 * If switch_output_trigger is hit, the data in the
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 was raised after or during
			 * record__mmap_read_all(), it didn't collect data from
			 * the overwritable ring buffer. Read again.
2465 			 */
2466 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2467 				continue;
2468 			trigger_ready(&switch_output_trigger);
2469 
2470 			/*
2471 			 * Reenable events in overwrite ring buffer after
2472 			 * record__mmap_read_all(): we should have collected
2473 			 * data from it.
2474 			 */
2475 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2476 
2477 			if (!quiet)
2478 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2479 					record__waking(rec));
2480 			thread->waking = 0;
2481 			fd = record__switch_output(rec, false);
2482 			if (fd < 0) {
2483 				pr_err("Failed to switch to new file\n");
2484 				trigger_error(&switch_output_trigger);
2485 				err = fd;
2486 				goto out_child;
2487 			}
2488 
2489 			/* re-arm the alarm */
2490 			if (rec->switch_output.time)
2491 				alarm(rec->switch_output.time);
2492 		}
2493 
2494 		if (hits == thread->samples) {
2495 			if (done || draining)
2496 				break;
2497 			err = fdarray__poll(&thread->pollfd, -1);
2498 			/*
			 * Propagate the error only if there is one. Ignore a
			 * positive number of returned events and interrupt errors.
2501 			 */
2502 			if (err > 0 || (err < 0 && errno == EINTR))
2503 				err = 0;
2504 			thread->waking++;
2505 
2506 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2507 					    record__thread_munmap_filtered, NULL) == 0)
2508 				draining = true;
2509 
2510 			evlist__ctlfd_update(rec->evlist,
2511 				&thread->pollfd.entries[thread->ctlfd_pos]);
2512 		}
2513 
2514 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2515 			switch (cmd) {
2516 			case EVLIST_CTL_CMD_SNAPSHOT:
2517 				hit_auxtrace_snapshot_trigger(rec);
2518 				evlist__ctlfd_ack(rec->evlist);
2519 				break;
2520 			case EVLIST_CTL_CMD_STOP:
2521 				done = 1;
2522 				break;
2523 			case EVLIST_CTL_CMD_ACK:
2524 			case EVLIST_CTL_CMD_UNSUPPORTED:
2525 			case EVLIST_CTL_CMD_ENABLE:
2526 			case EVLIST_CTL_CMD_DISABLE:
2527 			case EVLIST_CTL_CMD_EVLIST:
2528 			case EVLIST_CTL_CMD_PING:
2529 			default:
2530 				break;
2531 			}
2532 		}
2533 
2534 		/*
		 * When perf is starting the traced process, the events die
		 * with the process at the end and we wait for that. Thus
		 * there is no need to disable events in this case.
2538 		 */
2539 		if (done && !disabled && !target__none(&opts->target)) {
2540 			trigger_off(&auxtrace_snapshot_trigger);
2541 			evlist__disable(rec->evlist);
2542 			disabled = true;
2543 		}
2544 	}
2545 
2546 	trigger_off(&auxtrace_snapshot_trigger);
2547 	trigger_off(&switch_output_trigger);
2548 
2549 	if (opts->auxtrace_snapshot_on_exit)
2550 		record__auxtrace_snapshot_exit(rec);
2551 
2552 	if (forks && workload_exec_errno) {
2553 		char msg[STRERR_BUFSIZE], strevsels[2048];
2554 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2555 
2556 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2557 
2558 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2559 			strevsels, argv[0], emsg);
2560 		err = -1;
2561 		goto out_child;
2562 	}
2563 
2564 	if (!quiet)
2565 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2566 			record__waking(rec));
2567 
2568 	if (target__none(&rec->opts.target))
2569 		record__synthesize_workload(rec, true);
2570 
2571 out_child:
2572 	record__stop_threads(rec);
2573 	record__mmap_read_all(rec, true);
2574 out_free_threads:
2575 	record__free_thread_data(rec);
2576 	evlist__finalize_ctlfd(rec->evlist);
2577 	record__aio_mmap_read_sync(rec);
2578 
2579 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2580 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2581 		session->header.env.comp_ratio = ratio + 0.5;
2582 	}
2583 
2584 	if (forks) {
2585 		int exit_status;
2586 
2587 		if (!child_finished)
2588 			kill(rec->evlist->workload.pid, SIGTERM);
2589 
2590 		wait(&exit_status);
2591 
2592 		if (err < 0)
2593 			status = err;
2594 		else if (WIFEXITED(exit_status))
2595 			status = WEXITSTATUS(exit_status);
2596 		else if (WIFSIGNALED(exit_status))
2597 			signr = WTERMSIG(exit_status);
2598 	} else
2599 		status = err;
2600 
2601 	if (rec->off_cpu)
2602 		rec->bytes_written += off_cpu_write(rec->session);
2603 
2604 	record__synthesize(rec, true);
2605 	/* this will be recalculated during process_buildids() */
2606 	rec->samples = 0;
2607 
2608 	if (!err) {
2609 		if (!rec->timestamp_filename) {
2610 			record__finish_output(rec);
2611 		} else {
2612 			fd = record__switch_output(rec, true);
2613 			if (fd < 0) {
2614 				status = fd;
2615 				goto out_delete_session;
2616 			}
2617 		}
2618 	}
2619 
2620 	perf_hooks__invoke_record_end();
2621 
2622 	if (!err && !quiet) {
2623 		char samples[128];
2624 		const char *postfix = rec->timestamp_filename ?
2625 					".<timestamp>" : "";
2626 
2627 		if (rec->samples && !rec->opts.full_auxtrace)
2628 			scnprintf(samples, sizeof(samples),
2629 				  " (%" PRIu64 " samples)", rec->samples);
2630 		else
2631 			samples[0] = '\0';
2632 
2633 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2634 			perf_data__size(data) / 1024.0 / 1024.0,
2635 			data->path, postfix, samples);
2636 		if (ratio) {
2637 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2638 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2639 					ratio);
2640 		}
2641 		fprintf(stderr, " ]\n");
2642 	}
2643 
2644 out_delete_session:
2645 #ifdef HAVE_EVENTFD_SUPPORT
2646 	if (done_fd >= 0)
2647 		close(done_fd);
2648 #endif
2649 	zstd_fini(&session->zstd_data);
2650 	perf_session__delete(session);
2651 
2652 	if (!opts->no_bpf_event)
2653 		evlist__stop_sb_thread(rec->sb_evlist);
2654 	return status;
2655 }
2656 
2657 static void callchain_debug(struct callchain_param *callchain)
2658 {
2659 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2660 
2661 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2662 
2663 	if (callchain->record_mode == CALLCHAIN_DWARF)
2664 		pr_debug("callchain: stack dump size %d\n",
2665 			 callchain->dump_size);
2666 }
2667 
2668 int record_opts__parse_callchain(struct record_opts *record,
2669 				 struct callchain_param *callchain,
2670 				 const char *arg, bool unset)
2671 {
2672 	int ret;
2673 	callchain->enabled = !unset;
2674 
2675 	/* --no-call-graph */
2676 	if (unset) {
2677 		callchain->record_mode = CALLCHAIN_NONE;
2678 		pr_debug("callchain: disabled\n");
2679 		return 0;
2680 	}
2681 
2682 	ret = parse_callchain_record_opt(arg, callchain);
2683 	if (!ret) {
2684 		/* Enable data address sampling for DWARF unwind. */
2685 		if (callchain->record_mode == CALLCHAIN_DWARF)
2686 			record->sample_address = true;
2687 		callchain_debug(callchain);
2688 	}
2689 
2690 	return ret;
2691 }
2692 
2693 int record_parse_callchain_opt(const struct option *opt,
2694 			       const char *arg,
2695 			       int unset)
2696 {
2697 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2698 }
2699 
2700 int record_callchain_opt(const struct option *opt,
2701 			 const char *arg __maybe_unused,
2702 			 int unset __maybe_unused)
2703 {
2704 	struct callchain_param *callchain = opt->value;
2705 
2706 	callchain->enabled = true;
2707 
2708 	if (callchain->record_mode == CALLCHAIN_NONE)
2709 		callchain->record_mode = CALLCHAIN_FP;
2710 
2711 	callchain_debug(callchain);
2712 	return 0;
2713 }
2714 
2715 static int perf_record_config(const char *var, const char *value, void *cb)
2716 {
2717 	struct record *rec = cb;
2718 
2719 	if (!strcmp(var, "record.build-id")) {
2720 		if (!strcmp(value, "cache"))
2721 			rec->no_buildid_cache = false;
2722 		else if (!strcmp(value, "no-cache"))
2723 			rec->no_buildid_cache = true;
2724 		else if (!strcmp(value, "skip"))
2725 			rec->no_buildid = true;
2726 		else if (!strcmp(value, "mmap"))
2727 			rec->buildid_mmap = true;
2728 		else
2729 			return -1;
2730 		return 0;
2731 	}
2732 	if (!strcmp(var, "record.call-graph")) {
2733 		var = "call-graph.record-mode";
2734 		return perf_default_config(var, value, cb);
2735 	}
2736 #ifdef HAVE_AIO_SUPPORT
2737 	if (!strcmp(var, "record.aio")) {
2738 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2739 		if (!rec->opts.nr_cblocks)
2740 			rec->opts.nr_cblocks = nr_cblocks_default;
2741 	}
2742 #endif
2743 	if (!strcmp(var, "record.debuginfod")) {
2744 		rec->debuginfod.urls = strdup(value);
2745 		if (!rec->debuginfod.urls)
2746 			return -ENOMEM;
2747 		rec->debuginfod.set = true;
2748 	}
2749 
2750 	return 0;
2751 }
2752 
static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2755 {
2756 	struct record_opts *opts = (struct record_opts *)opt->value;
2757 
2758 	if (unset || !str)
2759 		return 0;
2760 
2761 	if (!strcasecmp(str, "node"))
2762 		opts->affinity = PERF_AFFINITY_NODE;
2763 	else if (!strcasecmp(str, "cpu"))
2764 		opts->affinity = PERF_AFFINITY_CPU;
2765 
2766 	return 0;
2767 }
2768 
2769 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
2770 {
2771 	mask->nbits = nr_bits;
2772 	mask->bits = bitmap_zalloc(mask->nbits);
2773 	if (!mask->bits)
2774 		return -ENOMEM;
2775 
2776 	return 0;
2777 }
2778 
2779 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
2780 {
2781 	bitmap_free(mask->bits);
2782 	mask->nbits = 0;
2783 }
2784 
2785 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
2786 {
2787 	int ret;
2788 
2789 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
2790 	if (ret) {
2791 		mask->affinity.bits = NULL;
2792 		return ret;
2793 	}
2794 
2795 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
2796 	if (ret) {
2797 		record__mmap_cpu_mask_free(&mask->maps);
2798 		mask->maps.bits = NULL;
2799 	}
2800 
2801 	return ret;
2802 }
2803 
2804 static void record__thread_mask_free(struct thread_mask *mask)
2805 {
2806 	record__mmap_cpu_mask_free(&mask->maps);
2807 	record__mmap_cpu_mask_free(&mask->affinity);
2808 }
2809 
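/*
 * Parse the --threads option: either one of the predefined specs
 * (cpu, core, package, numa) or a user provided list of mask pairs;
 * with no argument the per-CPU spec is used.
 */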
2810 static int record__parse_threads(const struct option *opt, const char *str, int unset)
2811 {
2812 	int s;
2813 	struct record_opts *opts = opt->value;
2814 
2815 	if (unset || !str || !strlen(str)) {
2816 		opts->threads_spec = THREAD_SPEC__CPU;
2817 	} else {
2818 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
2819 			if (s == THREAD_SPEC__USER) {
2820 				opts->threads_user_spec = strdup(str);
2821 				if (!opts->threads_user_spec)
2822 					return -ENOMEM;
2823 				opts->threads_spec = THREAD_SPEC__USER;
2824 				break;
2825 			}
2826 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
2827 				opts->threads_spec = s;
2828 				break;
2829 			}
2830 		}
2831 	}
2832 
2833 	if (opts->threads_spec == THREAD_SPEC__USER)
2834 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
2835 	else
2836 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
2837 
2838 	return 0;
2839 }
2840 
2841 static int parse_output_max_size(const struct option *opt,
2842 				 const char *str, int unset)
2843 {
2844 	unsigned long *s = (unsigned long *)opt->value;
2845 	static struct parse_tag tags_size[] = {
2846 		{ .tag  = 'B', .mult = 1       },
2847 		{ .tag  = 'K', .mult = 1 << 10 },
2848 		{ .tag  = 'M', .mult = 1 << 20 },
2849 		{ .tag  = 'G', .mult = 1 << 30 },
2850 		{ .tag  = 0 },
2851 	};
2852 	unsigned long val;
2853 
2854 	if (unset) {
2855 		*s = 0;
2856 		return 0;
2857 	}
2858 
2859 	val = parse_tag_value(str, tags_size);
2860 	if (val != (unsigned long) -1) {
2861 		*s = val;
2862 		return 0;
2863 	}
2864 
2865 	return -1;
2866 }
2867 
2868 static int record__parse_mmap_pages(const struct option *opt,
2869 				    const char *str,
2870 				    int unset __maybe_unused)
2871 {
2872 	struct record_opts *opts = opt->value;
2873 	char *s, *p;
2874 	unsigned int mmap_pages;
2875 	int ret;
2876 
2877 	if (!str)
2878 		return -EINVAL;
2879 
2880 	s = strdup(str);
2881 	if (!s)
2882 		return -ENOMEM;
2883 
2884 	p = strchr(s, ',');
2885 	if (p)
2886 		*p = '\0';
2887 
2888 	if (*s) {
2889 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2890 		if (ret)
2891 			goto out_free;
2892 		opts->mmap_pages = mmap_pages;
2893 	}
2894 
2895 	if (!p) {
2896 		ret = 0;
2897 		goto out_free;
2898 	}
2899 
2900 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2901 	if (ret)
2902 		goto out_free;
2903 
2904 	opts->auxtrace_mmap_pages = mmap_pages;
2905 
2906 out_free:
2907 	free(s);
2908 	return ret;
2909 }
2910 
2911 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
2912 {
2913 }
2914 
2915 static int parse_control_option(const struct option *opt,
2916 				const char *str,
2917 				int unset __maybe_unused)
2918 {
2919 	struct record_opts *opts = opt->value;
2920 
2921 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2922 }
2923 
2924 static void switch_output_size_warn(struct record *rec)
2925 {
2926 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2927 	struct switch_output *s = &rec->switch_output;
2928 
2929 	wakeup_size /= 2;
2930 
2931 	if (s->size < wakeup_size) {
2932 		char buf[100];
2933 
2934 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2935 		pr_warning("WARNING: switch-output data size lower than "
2936 			   "wakeup kernel buffer size (%s) "
2937 			   "expect bigger perf.data sizes\n", buf);
2938 	}
2939 }
2940 
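/*
 * Configure --switch-output: rotate the output on SIGUSR2 ("signal"),
 * when a size threshold is crossed, or on a time period.  Any of these
 * implies timestamped output file names.  Not available in parallel
 * streaming mode.
 */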
2941 static int switch_output_setup(struct record *rec)
2942 {
2943 	struct switch_output *s = &rec->switch_output;
2944 	static struct parse_tag tags_size[] = {
2945 		{ .tag  = 'B', .mult = 1       },
2946 		{ .tag  = 'K', .mult = 1 << 10 },
2947 		{ .tag  = 'M', .mult = 1 << 20 },
2948 		{ .tag  = 'G', .mult = 1 << 30 },
2949 		{ .tag  = 0 },
2950 	};
2951 	static struct parse_tag tags_time[] = {
2952 		{ .tag  = 's', .mult = 1        },
2953 		{ .tag  = 'm', .mult = 60       },
2954 		{ .tag  = 'h', .mult = 60*60    },
2955 		{ .tag  = 'd', .mult = 60*60*24 },
2956 		{ .tag  = 0 },
2957 	};
2958 	unsigned long val;
2959 
2960 	/*
	 * If we're using --switch-output-event, then we imply
	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
	 * thread to its parent.
2964 	 */
2965 	if (rec->switch_output_event_set) {
2966 		if (record__threads_enabled(rec)) {
2967 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
2968 			return 0;
2969 		}
2970 		goto do_signal;
2971 	}
2972 
2973 	if (!s->set)
2974 		return 0;
2975 
2976 	if (record__threads_enabled(rec)) {
2977 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
2978 		return 0;
2979 	}
2980 
2981 	if (!strcmp(s->str, "signal")) {
2982 do_signal:
2983 		s->signal = true;
2984 		pr_debug("switch-output with SIGUSR2 signal\n");
2985 		goto enabled;
2986 	}
2987 
2988 	val = parse_tag_value(s->str, tags_size);
2989 	if (val != (unsigned long) -1) {
2990 		s->size = val;
2991 		pr_debug("switch-output with %s size threshold\n", s->str);
2992 		goto enabled;
2993 	}
2994 
2995 	val = parse_tag_value(s->str, tags_time);
2996 	if (val != (unsigned long) -1) {
2997 		s->time = val;
2998 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2999 			 s->str, s->time);
3000 		goto enabled;
3001 	}
3002 
3003 	return -1;
3004 
3005 enabled:
3006 	rec->timestamp_filename = true;
3007 	s->enabled              = true;
3008 
3009 	if (s->size && !rec->opts.no_buffering)
3010 		switch_output_size_warn(rec);
3011 
3012 	return 0;
3013 }
3014 
3015 static const char * const __record_usage[] = {
3016 	"perf record [<options>] [<command>]",
3017 	"perf record [<options>] -- <command> [<options>]",
3018 	NULL
3019 };
3020 const char * const *record_usage = __record_usage;
3021 
3022 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3023 				  struct perf_sample *sample, struct machine *machine)
3024 {
3025 	/*
	 * We already have the kernel maps, put in place via
	 * perf_session__create_kernel_maps(), so there is no need to add
	 * them twice.
3028 	 */
3029 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3030 		return 0;
3031 	return perf_event__process_mmap(tool, event, sample, machine);
3032 }
3033 
3034 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3035 				   struct perf_sample *sample, struct machine *machine)
3036 {
3037 	/*
	 * We already have the kernel maps, put in place via
	 * perf_session__create_kernel_maps(), so there is no need to add
	 * them twice.
3040 	 */
3041 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3042 		return 0;
3043 
3044 	return perf_event__process_mmap2(tool, event, sample, machine);
3045 }
3046 
3047 static int process_timestamp_boundary(struct perf_tool *tool,
3048 				      union perf_event *event __maybe_unused,
3049 				      struct perf_sample *sample,
3050 				      struct machine *machine __maybe_unused)
3051 {
3052 	struct record *rec = container_of(tool, struct record, tool);
3053 
3054 	set_timestamp_boundary(rec, sample->time);
3055 	return 0;
3056 }
3057 
3058 static int parse_record_synth_option(const struct option *opt,
3059 				     const char *str,
3060 				     int unset __maybe_unused)
3061 {
3062 	struct record_opts *opts = opt->value;
3063 	char *p = strdup(str);
3064 
3065 	if (p == NULL)
3066 		return -1;
3067 
3068 	opts->synth = parse_synth_opt(p);
3069 	free(p);
3070 
3071 	if (opts->synth < 0) {
3072 		pr_err("Invalid synth option: %s\n", str);
3073 		return -1;
3074 	}
3075 	return 0;
3076 }
3077 
3078 /*
 * XXX Ideally this would be local to cmd_record() and passed to a record__new,
 * because we need access to it in record__exit(), which is called after
 * cmd_record() exits, but since record_options needs to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
3085  *
3086  * Just say no to tons of global variables, sigh.
3087  */
3088 static struct record record = {
3089 	.opts = {
3090 		.sample_time	     = true,
3091 		.mmap_pages	     = UINT_MAX,
3092 		.user_freq	     = UINT_MAX,
3093 		.user_interval	     = ULLONG_MAX,
3094 		.freq		     = 4000,
3095 		.target		     = {
3096 			.uses_mmap   = true,
3097 			.default_per_cpu = true,
3098 		},
3099 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3100 		.nr_threads_synthesize = 1,
3101 		.ctl_fd              = -1,
3102 		.ctl_fd_ack          = -1,
3103 		.synth               = PERF_SYNTH_ALL,
3104 	},
3105 	.tool = {
3106 		.sample		= process_sample_event,
3107 		.fork		= perf_event__process_fork,
3108 		.exit		= perf_event__process_exit,
3109 		.comm		= perf_event__process_comm,
3110 		.namespaces	= perf_event__process_namespaces,
3111 		.mmap		= build_id__process_mmap,
3112 		.mmap2		= build_id__process_mmap2,
3113 		.itrace_start	= process_timestamp_boundary,
3114 		.aux		= process_timestamp_boundary,
3115 		.ordered_events	= true,
3116 	},
3117 };
3118 
3119 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3120 	"\n\t\t\t\tDefault: fp";
3121 
3122 static bool dry_run;
3123 
3124 /*
 * XXX Will stay a global variable until we fix builtin-script.c to stop messing
 * with it and switch to using the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
 * using pipes, etc.
3130  */
3131 static struct option __record_options[] = {
3132 	OPT_CALLBACK('e', "event", &record.evlist, "event",
3133 		     "event selector. use 'perf list' to list available events",
3134 		     parse_events_option),
3135 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3136 		     "event filter", parse_filter),
3137 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3138 			   NULL, "don't record events from perf itself",
3139 			   exclude_perf),
3140 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3141 		    "record events on existing process id"),
3142 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3143 		    "record events on existing thread id"),
3144 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3145 		    "collect data with this RT SCHED_FIFO priority"),
3146 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3147 		    "collect data without buffering"),
3148 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3149 		    "collect raw sample records from all opened counters"),
3150 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3151 			    "system-wide collection from all CPUs"),
3152 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3153 		    "list of cpus to monitor"),
3154 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3155 	OPT_STRING('o', "output", &record.data.path, "file",
3156 		    "output file name"),
3157 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3158 			&record.opts.no_inherit_set,
3159 			"child tasks do not inherit counters"),
3160 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3161 		    "synthesize non-sample events at the end of output"),
3162 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3163 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3164 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3165 		    "Fail if the specified frequency can't be used"),
3166 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3167 		     "profile at this frequency",
3168 		      record__parse_freq),
3169 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3170 		     "number of mmap data pages and AUX area tracing mmap pages",
3171 		     record__parse_mmap_pages),
3172 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3173 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3174 		     record__mmap_flush_parse),
3175 	OPT_BOOLEAN(0, "group", &record.opts.group,
3176 		    "put the counters into a counter group"),
3177 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3178 			   NULL, "enables call-graph recording" ,
3179 			   &record_callchain_opt),
3180 	OPT_CALLBACK(0, "call-graph", &record.opts,
3181 		     "record_mode[,record_size]", record_callchain_help,
3182 		     &record_parse_callchain_opt),
3183 	OPT_INCR('v', "verbose", &verbose,
3184 		    "be more verbose (show counter open errors, etc)"),
3185 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
3186 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3187 		    "per thread counts"),
3188 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3189 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3190 		    "Record the sample physical addresses"),
3191 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3192 		    "Record the sampled data address data page size"),
3193 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3194 		    "Record the sampled code address (ip) page size"),
3195 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3196 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3197 			&record.opts.sample_time_set,
3198 			"Record the sample timestamps"),
3199 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3200 			"Record the sample period"),
3201 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3202 		    "don't sample"),
3203 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3204 			&record.no_buildid_cache_set,
3205 			"do not update the buildid cache"),
3206 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3207 			&record.no_buildid_set,
3208 			"do not collect buildids in perf.data"),
3209 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3210 		     "monitor event in cgroup name only",
3211 		     parse_cgroups),
3212 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
3213 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
3214 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3215 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3216 		   "user to profile"),
3217 
3218 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3219 		     "branch any", "sample any taken branches",
3220 		     parse_branch_stack),
3221 
3222 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3223 		     "branch filter mask", "branch stack filter modes",
3224 		     parse_branch_stack),
3225 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3226 		    "sample by weight (on special events only)"),
3227 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3228 		    "sample transaction flags (special events only)"),
3229 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3230 		    "use per-thread mmaps"),
3231 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3232 		    "sample selected machine registers on interrupt,"
3233 		    " use '-I?' to list register names", parse_intr_regs),
3234 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3235 		    "sample selected machine registers on interrupt,"
3236 		    " use '--user-regs=?' to list register names", parse_user_regs),
3237 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3238 		    "Record running/enabled time of read (:S) events"),
3239 	OPT_CALLBACK('k', "clockid", &record.opts,
3240 	"clockid", "clockid to use for events, see clock_gettime()",
3241 	parse_clockid),
3242 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3243 			  "opts", "AUX area tracing Snapshot Mode", ""),
3244 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3245 			  "opts", "sample AUX area", ""),
3246 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3247 			"per thread proc mmap processing timeout in ms"),
3248 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3249 		    "Record namespaces events"),
3250 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3251 		    "Record cgroup events"),
3252 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3253 			&record.opts.record_switch_events_set,
3254 			"Record context switch events"),
3255 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3256 			 "Configure all used events to run in kernel space.",
3257 			 PARSE_OPT_EXCLUSIVE),
3258 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3259 			 "Configure all used events to run in user space.",
3260 			 PARSE_OPT_EXCLUSIVE),
3261 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3262 		    "collect kernel callchains"),
3263 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3264 		    "collect user callchains"),
3265 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
3266 		   "clang binary to use for compiling BPF scriptlets"),
3267 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
3268 		   "options passed to clang when compiling BPF scriptlets"),
3269 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3270 		   "file", "vmlinux pathname"),
3271 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3272 		    "Record build-id of all DSOs regardless of hits"),
3273 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3274 		    "Record build-id in map events"),
3275 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3276 		    "append timestamp to output filename"),
3277 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3278 		    "Record timestamp boundary (time of first/last samples)"),
3279 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3280 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3281 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3282 			  "signal"),
3283 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
3284 			 "switch output event selector. use 'perf list' to list available events",
3285 			 parse_events_option_new_evlist),
3286 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3287 		   "Limit number of switch output generated files"),
3288 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3289 		    "Parse options then exit"),
3290 #ifdef HAVE_AIO_SUPPORT
3291 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3292 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3293 		     record__aio_parse),
3294 #endif
3295 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3296 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3297 		     record__parse_affinity),
3298 #ifdef HAVE_ZSTD_SUPPORT
3299 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3300 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3301 			    record__parse_comp_level),
3302 #endif
3303 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3304 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3305 	OPT_UINTEGER(0, "num-thread-synthesize",
3306 		     &record.opts.nr_threads_synthesize,
3307 		     "number of threads to run for event synthesis"),
3308 #ifdef HAVE_LIBPFM
3309 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3310 		"libpfm4 event selector. use 'perf list' to list available events",
3311 		parse_libpfm_events_option),
3312 #endif
3313 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3314 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3315 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3316 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3317 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3318 		      parse_control_option),
3319 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3320 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3321 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3322 			  &record.debuginfod.set, "debuginfod urls",
3323 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3324 			  "system"),
3325 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3326 			    "write collected trace data into several data files using parallel threads",
3327 			    record__parse_threads),
3328 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3329 	OPT_END()
3330 };
3331 
3332 struct option *record_options = __record_options;
3333 
3334 static void record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3335 {
3336 	struct perf_cpu cpu;
3337 	int idx;
3338 
3339 	if (cpu_map__is_dummy(cpus))
3340 		return;
3341 
3342 	perf_cpu_map__for_each_cpu(cpu, idx, cpus)
3343 		set_bit(cpu.cpu, mask->bits);
3344 }
3345 
3346 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3347 {
3348 	struct perf_cpu_map *cpus;
3349 
3350 	cpus = perf_cpu_map__new(mask_spec);
3351 	if (!cpus)
3352 		return -ENOMEM;
3353 
3354 	bitmap_zero(mask->bits, mask->nbits);
3355 	record__mmap_cpu_mask_init(mask, cpus);
3356 	perf_cpu_map__put(cpus);
3357 
3358 	return 0;
3359 }
3360 
3361 static void record__free_thread_masks(struct record *rec, int nr_threads)
3362 {
3363 	int t;
3364 
3365 	if (rec->thread_masks)
3366 		for (t = 0; t < nr_threads; t++)
3367 			record__thread_mask_free(&rec->thread_masks[t]);
3368 
3369 	zfree(&rec->thread_masks);
3370 }
3371 
3372 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3373 {
3374 	int t, ret;
3375 
3376 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3377 	if (!rec->thread_masks) {
3378 		pr_err("Failed to allocate thread masks\n");
3379 		return -ENOMEM;
3380 	}
3381 
3382 	for (t = 0; t < nr_threads; t++) {
3383 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3384 		if (ret) {
3385 			pr_err("Failed to allocate thread masks[%d]\n", t);
3386 			goto out_free;
3387 		}
3388 	}
3389 
3390 	return 0;
3391 
3392 out_free:
3393 	record__free_thread_masks(rec, nr_threads);
3394 
3395 	return ret;
3396 }
3397 
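/*
 * --threads=cpu: one reader thread per monitored CPU, each one mapping
 * and affined to exactly that CPU.
 */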
3398 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3399 {
3400 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3401 
3402 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3403 	if (ret)
3404 		return ret;
3405 
3406 	rec->nr_threads = nr_cpus;
3407 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3408 
3409 	for (t = 0; t < rec->nr_threads; t++) {
3410 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3411 		set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3412 		if (verbose) {
3413 			pr_debug("thread_masks[%d]: ", t);
3414 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3415 			pr_debug("thread_masks[%d]: ", t);
3416 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3417 		}
3418 	}
3419 
3420 	return 0;
3421 }
3422 
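/*
 * Build one thread mask per specification: each spec provides the CPUs
 * a thread will read mmaps from (@maps_spec) and the CPUs it will be
 * affined to (@affinity_spec).  Masks are clamped to the monitored CPUs
 * and must be neither empty nor overlapping.
 */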
3423 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3424 					  const char **maps_spec, const char **affinity_spec,
3425 					  u32 nr_spec)
3426 {
3427 	u32 s;
3428 	int ret = 0, t = 0;
3429 	struct mmap_cpu_mask cpus_mask;
3430 	struct thread_mask thread_mask, full_mask, *thread_masks;
3431 
3432 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3433 	if (ret) {
3434 		pr_err("Failed to allocate CPUs mask\n");
3435 		return ret;
3436 	}
3437 	record__mmap_cpu_mask_init(&cpus_mask, cpus);
3438 
3439 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3440 	if (ret) {
3441 		pr_err("Failed to allocate full mask\n");
3442 		goto out_free_cpu_mask;
3443 	}
3444 
3445 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3446 	if (ret) {
3447 		pr_err("Failed to allocate thread mask\n");
3448 		goto out_free_full_and_cpu_masks;
3449 	}
3450 
3451 	for (s = 0; s < nr_spec; s++) {
3452 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3453 		if (ret) {
3454 			pr_err("Failed to initialize maps thread mask\n");
3455 			goto out_free;
3456 		}
3457 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3458 		if (ret) {
3459 			pr_err("Failed to initialize affinity thread mask\n");
3460 			goto out_free;
3461 		}
3462 
3463 		/* ignore invalid CPUs but do not allow empty masks */
3464 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3465 				cpus_mask.bits, thread_mask.maps.nbits)) {
3466 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3467 			ret = -EINVAL;
3468 			goto out_free;
3469 		}
3470 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3471 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3472 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3473 			ret = -EINVAL;
3474 			goto out_free;
3475 		}
3476 
3477 		/* do not allow intersection with other masks (full_mask) */
3478 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3479 				      thread_mask.maps.nbits)) {
3480 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3481 			ret = -EINVAL;
3482 			goto out_free;
3483 		}
3484 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3485 				      thread_mask.affinity.nbits)) {
3486 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3487 			ret = -EINVAL;
3488 			goto out_free;
3489 		}
3490 
3491 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3492 			  thread_mask.maps.bits, full_mask.maps.nbits);
		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
			  thread_mask.affinity.bits, full_mask.affinity.nbits);
3495 
3496 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3497 		if (!thread_masks) {
3498 			pr_err("Failed to reallocate thread masks\n");
3499 			ret = -ENOMEM;
3500 			goto out_free;
3501 		}
3502 		rec->thread_masks = thread_masks;
3503 		rec->thread_masks[t] = thread_mask;
3504 		if (verbose) {
3505 			pr_debug("thread_masks[%d]: ", t);
3506 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3507 			pr_debug("thread_masks[%d]: ", t);
3508 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3509 		}
3510 		t++;
3511 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3512 		if (ret) {
3513 			pr_err("Failed to allocate thread mask\n");
3514 			goto out_free_full_and_cpu_masks;
3515 		}
3516 	}
3517 	rec->nr_threads = t;
3518 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3519 	if (!rec->nr_threads)
3520 		ret = -EINVAL;
3521 
3522 out_free:
3523 	record__thread_mask_free(&thread_mask);
3524 out_free_full_and_cpu_masks:
3525 	record__thread_mask_free(&full_mask);
3526 out_free_cpu_mask:
3527 	record__mmap_cpu_mask_free(&cpus_mask);
3528 
3529 	return ret;
3530 }
3531 
3532 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3533 {
3534 	int ret;
3535 	struct cpu_topology *topo;
3536 
3537 	topo = cpu_topology__new();
3538 	if (!topo) {
3539 		pr_err("Failed to allocate CPU topology\n");
3540 		return -ENOMEM;
3541 	}
3542 
3543 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3544 					     topo->core_cpus_list, topo->core_cpus_lists);
3545 	cpu_topology__delete(topo);
3546 
3547 	return ret;
3548 }
3549 
3550 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3551 {
3552 	int ret;
3553 	struct cpu_topology *topo;
3554 
3555 	topo = cpu_topology__new();
3556 	if (!topo) {
3557 		pr_err("Failed to allocate CPU topology\n");
3558 		return -ENOMEM;
3559 	}
3560 
3561 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3562 					     topo->package_cpus_list, topo->package_cpus_lists);
3563 	cpu_topology__delete(topo);
3564 
3565 	return ret;
3566 }
3567 
3568 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3569 {
3570 	u32 s;
3571 	int ret;
3572 	const char **spec;
3573 	struct numa_topology *topo;
3574 
3575 	topo = numa_topology__new();
3576 	if (!topo) {
3577 		pr_err("Failed to allocate NUMA topology\n");
3578 		return -ENOMEM;
3579 	}
3580 
3581 	spec = zalloc(topo->nr * sizeof(char *));
3582 	if (!spec) {
3583 		pr_err("Failed to allocate NUMA spec\n");
3584 		ret = -ENOMEM;
3585 		goto out_delete_topo;
3586 	}
3587 	for (s = 0; s < topo->nr; s++)
3588 		spec[s] = topo->nodes[s].cpus;
3589 
3590 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3591 
3592 	zfree(&spec);
3593 
3594 out_delete_topo:
3595 	numa_topology__delete(topo);
3596 
3597 	return ret;
3598 }
3599 
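/*
 * Parse a user supplied --threads specification. Entries are separated by
 * ':'; each entry is a maps CPU mask and an affinity CPU mask separated by
 * '/'. As an illustrative (made-up) example,
 *
 *   --threads=0-3/0:4-7/4
 *
 * would request one thread reading the mmaps of CPUs 0-3 while pinned to
 * CPU 0 and another reading CPUs 4-7 while pinned to CPU 4.
 */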
3600 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3601 {
3602 	int t, ret;
3603 	u32 s, nr_spec = 0;
3604 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3605 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3606 
3607 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3608 		spec = strtok_r(user_spec, ":", &spec_ptr);
3609 		if (spec == NULL)
3610 			break;
3611 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3612 		mask = strtok_r(spec, "/", &mask_ptr);
3613 		if (mask == NULL)
3614 			break;
3615 		pr_debug2("  maps mask: %s\n", mask);
3616 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3617 		if (!tmp_spec) {
3618 			pr_err("Failed to reallocate maps spec\n");
3619 			ret = -ENOMEM;
3620 			goto out_free;
3621 		}
3622 		maps_spec = tmp_spec;
3623 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3624 		if (!maps_spec[nr_spec]) {
3625 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3626 			ret = -ENOMEM;
3627 			goto out_free;
3628 		}
3629 		mask = strtok_r(NULL, "/", &mask_ptr);
3630 		if (mask == NULL) {
3631 			pr_err("Invalid thread maps or affinity specs\n");
3632 			ret = -EINVAL;
3633 			goto out_free;
3634 		}
3635 		pr_debug2("  affinity mask: %s\n", mask);
3636 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3637 		if (!tmp_spec) {
3638 			pr_err("Failed to reallocate affinity spec\n");
3639 			ret = -ENOMEM;
3640 			goto out_free;
3641 		}
3642 		affinity_spec = tmp_spec;
3643 		affinity_spec[nr_spec] = strdup(mask);
3644 		if (!affinity_spec[nr_spec]) {
3645 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3646 			ret = -ENOMEM;
3647 			goto out_free;
3648 		}
3649 		dup_mask = NULL;
3650 		nr_spec++;
3651 	}
3652 
3653 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3654 					     (const char **)affinity_spec, nr_spec);
3655 
3656 out_free:
3657 	free(dup_mask);
3658 	for (s = 0; s < nr_spec; s++) {
3659 		if (maps_spec)
3660 			free(maps_spec[s]);
3661 		if (affinity_spec)
3662 			free(affinity_spec[s]);
3663 	}
3664 	free(affinity_spec);
3665 	free(maps_spec);
3666 
3667 	return ret;
3668 }
3669 
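/*
 * Default, non-parallel mode: allocate a single thread mask, initialize only
 * its maps side from the evlist CPUs and set nr_threads to 1.
 */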
3670 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3671 {
3672 	int ret;
3673 
3674 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3675 	if (ret)
3676 		return ret;
3677 
3678 	record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus);
3679 
3680 	rec->nr_threads = 1;
3681 
3682 	return 0;
3683 }
3684 
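/*
 * Dispatch on the --threads specification: fall back to the single-thread
 * default when parallel streaming is not enabled, otherwise build per-CPU,
 * per-core, per-package, per-NUMA-node or user-defined masks (e.g.
 * --threads=numa selects the per-node grouping). --per-thread recording is
 * rejected because it cannot be combined with parallel streaming.
 */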
3685 static int record__init_thread_masks(struct record *rec)
3686 {
3687 	int ret = 0;
3688 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3689 
3690 	if (!record__threads_enabled(rec))
3691 		return record__init_thread_default_masks(rec, cpus);
3692 
3693 	if (evlist__per_thread(rec->evlist)) {
3694 		pr_err("--per-thread option is mutually exclusive with parallel streaming mode.\n");
3695 		return -EINVAL;
3696 	}
3697 
3698 	switch (rec->opts.threads_spec) {
3699 	case THREAD_SPEC__CPU:
3700 		ret = record__init_thread_cpu_masks(rec, cpus);
3701 		break;
3702 	case THREAD_SPEC__CORE:
3703 		ret = record__init_thread_core_masks(rec, cpus);
3704 		break;
3705 	case THREAD_SPEC__PACKAGE:
3706 		ret = record__init_thread_package_masks(rec, cpus);
3707 		break;
3708 	case THREAD_SPEC__NUMA:
3709 		ret = record__init_thread_numa_masks(rec, cpus);
3710 		break;
3711 	case THREAD_SPEC__USER:
3712 		ret = record__init_thread_user_masks(rec, cpus);
3713 		break;
3714 	default:
3715 		break;
3716 	}
3717 
3718 	return ret;
3719 }
3720 
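/*
 * Entry point of 'perf record': parse and validate options and the target,
 * set up build ID handling, auxtrace, parallel streaming masks and
 * compression limits, then hand off to __cmd_record().
 */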
3721 int cmd_record(int argc, const char **argv)
3722 {
3723 	int err;
3724 	struct record *rec = &record;
3725 	char errbuf[BUFSIZ];
3726 
3727 	setlocale(LC_ALL, "");
3728 
3729 #ifndef HAVE_LIBBPF_SUPPORT
3730 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
3731 	set_nobuild('\0', "clang-path", true);
3732 	set_nobuild('\0', "clang-opt", true);
3733 # undef set_nobuild
3734 #endif
3735 
3736 #ifndef HAVE_BPF_PROLOGUE
3737 # if !defined (HAVE_DWARF_SUPPORT)
3738 #  define REASON  "NO_DWARF=1"
3739 # elif !defined (HAVE_LIBBPF_SUPPORT)
3740 #  define REASON  "NO_LIBBPF=1"
3741 # else
3742 #  define REASON  "this architecture doesn't support BPF prologue"
3743 # endif
3744 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
3745 	set_nobuild('\0', "vmlinux", true);
3746 # undef set_nobuild
3747 # undef REASON
3748 #endif
3749 
3750 #ifndef HAVE_BPF_SKEL
3751 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3752 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3753 # undef set_nobuild
3754 #endif
3755 
3756 	rec->opts.affinity = PERF_AFFINITY_SYS;
3757 
3758 	rec->evlist = evlist__new();
3759 	if (rec->evlist == NULL)
3760 		return -ENOMEM;
3761 
3762 	err = perf_config(perf_record_config, rec);
3763 	if (err)
3764 		return err;
3765 
3766 	argc = parse_options(argc, argv, record_options, record_usage,
3767 			    PARSE_OPT_STOP_AT_NON_OPTION);
3768 	if (quiet)
3769 		perf_quiet_option();
3770 
3771 	err = symbol__validate_sym_arguments();
3772 	if (err)
3773 		return err;
3774 
3775 	perf_debuginfod_setup(&record.debuginfod);
3776 
3777 	/* Make system wide (-a) the default target. */
3778 	if (!argc && target__none(&rec->opts.target))
3779 		rec->opts.target.system_wide = true;
3780 
3781 	if (nr_cgroups && !rec->opts.target.system_wide) {
3782 		usage_with_options_msg(record_usage, record_options,
3783 			"cgroup monitoring only available in system-wide mode");
3784 
3785 	}
3786 
3787 	if (rec->buildid_mmap) {
3788 		if (!perf_can_record_build_id()) {
3789 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
3790 			pr_err("Failed: no support for recording build IDs in mmap events, update your kernel.\n");
3791 			goto out_opts;
3792 		}
3793 		pr_debug("Enabling build id in mmap2 events.\n");
3794 		/* Enable mmap build id synthesizing. */
3795 		symbol_conf.buildid_mmap2 = true;
3796 		/* Enable perf_event_attr::build_id bit. */
3797 		rec->opts.build_id = true;
3798 		/* Disable build id cache. */
3799 		rec->no_buildid = true;
3800 	}
3801 
3802 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
3803 		pr_err("Kernel has no cgroup sampling support.\n");
3804 		err = -EINVAL;
3805 		goto out_opts;
3806 	}
3807 
3808 	if (rec->opts.kcore || record__threads_enabled(rec))
3809 		rec->data.is_dir = true;
3810 
3811 	if (record__threads_enabled(rec)) {
3812 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
3813 			pr_err("--affinity option is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
3814 			goto out_opts;
3815 		}
3816 		if (record__aio_enabled(rec)) {
3817 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive with parallel streaming mode.\n");
			err = -EINVAL;
3818 			goto out_opts;
3819 		}
3820 	}
3821 
3822 	if (rec->opts.comp_level != 0) {
3823 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
3824 		rec->no_buildid = true;
3825 	}
3826 
3827 	if (rec->opts.record_switch_events &&
3828 	    !perf_can_record_switch_events()) {
3829 		ui__error("kernel does not support recording context switch events\n");
3830 		parse_options_usage(record_usage, record_options, "switch-events", 0);
3831 		err = -EINVAL;
3832 		goto out_opts;
3833 	}
3834 
3835 	if (switch_output_setup(rec)) {
3836 		parse_options_usage(record_usage, record_options, "switch-output", 0);
3837 		err = -EINVAL;
3838 		goto out_opts;
3839 	}
3840 
3841 	if (rec->switch_output.time) {
3842 		signal(SIGALRM, alarm_sig_handler);
3843 		alarm(rec->switch_output.time);
3844 	}
3845 
3846 	if (rec->switch_output.num_files) {
3847 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
3848 						      sizeof(char *));
3849 		if (!rec->switch_output.filenames) {
3850 			err = -ENOMEM;
3851 			goto out_opts;
3852 		}
3853 	}
3854 
3855 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
3856 		rec->timestamp_filename = false;
3857 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
3858 	}
3859 
3860 	/*
3861 	 * Allow aliases to facilitate the lookup of symbols for address
3862 	 * filters. Refer to auxtrace_parse_filters().
3863 	 */
3864 	symbol_conf.allow_aliases = true;
3865 
3866 	symbol__init(NULL);
3867 
3868 	err = record__auxtrace_init(rec);
3869 	if (err)
3870 		goto out;
3871 
3872 	if (dry_run)
3873 		goto out;
3874 
3875 	err = bpf__setup_stdout(rec->evlist);
3876 	if (err) {
3877 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
3878 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
3879 			 errbuf);
3880 		goto out;
3881 	}
3882 
3883 	err = -ENOMEM;
3884 
3885 	if (rec->no_buildid_cache || rec->no_buildid) {
3886 		disable_buildid_cache();
3887 	} else if (rec->switch_output.enabled) {
3888 		/*
3889 		 * In 'perf record --switch-output', disable build ID
3890 		 * generation by default to reduce data file switching
3891 		 * overhead. Still generate build IDs if they are explicitly
3892 		 * requested with
3893 		 *
3894 		 *  perf record --switch-output --no-no-buildid \
3895 		 *              --no-no-buildid-cache
3896 		 *
3897 		 * The code below is equivalent to:
3898 		 *
3899 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
3900 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
3901 		 *         disable_buildid_cache();
3902 		 */
3903 		bool disable = true;
3904 
3905 		if (rec->no_buildid_set && !rec->no_buildid)
3906 			disable = false;
3907 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
3908 			disable = false;
3909 		if (disable) {
3910 			rec->no_buildid = true;
3911 			rec->no_buildid_cache = true;
3912 			disable_buildid_cache();
3913 		}
3914 	}
3915 
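	/*
	 * An overwritable ring buffer keeps only the most recent samples, so
	 * collect the non-sample events (fork, comm, mmap, ...) when
	 * finalizing the output instead of at the start, so that they reflect
	 * the state the kept samples were taken in.
	 */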
3916 	if (record.opts.overwrite)
3917 		record.opts.tail_synthesize = true;
3918 
3919 	if (rec->evlist->core.nr_entries == 0) {
3920 		if (perf_pmu__has_hybrid()) {
3921 			err = evlist__add_default_hybrid(rec->evlist,
3922 							 !record.opts.no_samples);
3923 		} else {
3924 			err = __evlist__add_default(rec->evlist,
3925 						    !record.opts.no_samples);
3926 		}
3927 
3928 		if (err < 0) {
3929 			pr_err("Not enough memory for event selector list\n");
3930 			goto out;
3931 		}
3932 	}
3933 
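	/*
	 * When attaching to an existing thread (-t), default to --no-inherit
	 * unless the user set it explicitly.
	 */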
3934 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
3935 		rec->opts.no_inherit = true;
3936 
3937 	err = target__validate(&rec->opts.target);
3938 	if (err) {
3939 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3940 		ui__warning("%s\n", errbuf);
3941 	}
3942 
3943 	err = target__parse_uid(&rec->opts.target);
3944 	if (err) {
3945 		int saved_errno = errno;
3946 
3947 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
3948 		ui__error("%s", errbuf);
3949 
3950 		err = -saved_errno;
3951 		goto out;
3952 	}
3953 
3954 	/* Enable ignoring missing threads when -u/-p option is defined. */
3955 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
3956 
3957 	if (evlist__fix_hybrid_cpus(rec->evlist, rec->opts.target.cpu_list)) {
3958 		pr_err("failed to use cpu list %s\n",
3959 		       rec->opts.target.cpu_list);
		err = -EINVAL;
3960 		goto out;
3961 	}
3962 
3963 	rec->opts.target.hybrid = perf_pmu__has_hybrid();
3964 
3965 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
3966 		arch__add_leaf_frame_record_opts(&rec->opts);
3967 
3968 	err = -ENOMEM;
3969 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
3970 		usage_with_options(record_usage, record_options);
3971 
3972 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
3973 	if (err)
3974 		goto out;
3975 
3976 	/*
3977 	 * Take all build IDs when the file contains AUX area tracing
3978 	 * data, because decoding the whole trace to find the ones that
3979 	 * are actually used would take too long.
3980 	 */
3981 	if (rec->opts.full_auxtrace)
3982 		rec->buildid_all = true;
3983 
3984 	if (rec->opts.text_poke) {
3985 		err = record__config_text_poke(rec->evlist);
3986 		if (err) {
3987 			pr_err("record__config_text_poke failed, error %d\n", err);
3988 			goto out;
3989 		}
3990 	}
3991 
3992 	if (rec->off_cpu) {
3993 		err = record__config_off_cpu(rec);
3994 		if (err) {
3995 			pr_err("record__config_off_cpu failed, error %d\n", err);
3996 			goto out;
3997 		}
3998 	}
3999 
4000 	if (record_opts__config(&rec->opts)) {
4001 		err = -EINVAL;
4002 		goto out;
4003 	}
4004 
4005 	err = record__init_thread_masks(rec);
4006 	if (err) {
4007 		pr_err("Failed to initialize parallel data streaming masks\n");
4008 		goto out;
4009 	}
4010 
4011 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4012 		rec->opts.nr_cblocks = nr_cblocks_max;
4013 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4014 
4015 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4016 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4017 
4018 	if (rec->opts.comp_level > comp_level_max)
4019 		rec->opts.comp_level = comp_level_max;
4020 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4021 
4022 	err = __cmd_record(&record, argc, argv);
4023 out:
4024 	evlist__delete(rec->evlist);
4025 	symbol__exit();
4026 	auxtrace_record__free(rec->itr);
4027 out_opts:
4028 	record__free_thread_masks(rec, rec->nr_threads);
4029 	rec->nr_threads = 0;
4030 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4031 	return err;
4032 }
4033 
4034 static void snapshot_sig_handler(int sig __maybe_unused)
4035 {
4036 	struct record *rec = &record;
4037 
4038 	hit_auxtrace_snapshot_trigger(rec);
4039 
4040 	if (switch_output_signal(rec))
4041 		trigger_hit(&switch_output_trigger);
4042 }
4043 
4044 static void alarm_sig_handler(int sig __maybe_unused)
4045 {
4046 	struct record *rec = &record;
4047 
4048 	if (switch_output_time(rec))
4049 		trigger_hit(&switch_output_trigger);
4050 }
4051