xref: /linux/tools/perf/builtin-record.c (revision 673f816b9e1e92d1f70e1bf5f21b531e0ff9ad6c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
/*
 * Settings and state of the switch-output feature. In this part of the file
 * only signal, size and time are read, by switch_output_signal(),
 * switch_output_size() and switch_output_time() respectively.
 */
struct switch_output {
	bool		 enabled;
	bool		 signal;	/* gate tested by switch_output_signal() */
	unsigned long	 size;		/* compared with rec->bytes_written; 0 disables the size check */
	unsigned long	 time;		/* non-zero enables switch_output_time() */
	const char	*str;
	bool		 set;
	char		 **filenames;
	int		 num_files;
	int		 cur_file;
};
95 
/*
 * Pair of CPU masks attached to a record_thread. The maps mask is what
 * record__thread_data_init_maps() tests to pick the mmaps a thread handles.
 * NOTE(review): affinity is presumably the CPU affinity of that thread; it is
 * not used in the code visible here - confirm.
 */
struct thread_mask {
	struct mmap_cpu_mask	maps;
	struct mmap_cpu_mask	affinity;
};
100 
/*
 * Per-thread recording state; rec->thread_data is an array of these with
 * rec->nr_threads entries (see record__free_thread_data()).
 */
struct record_thread {
	pid_t			tid;
	struct thread_mask	*mask;
	/* Two pipe(2) pairs; all four fds are -1 while closed. */
	struct {
		int		msg[2];
		int		ack[2];
	} pipes;
	/* Filled by record__thread_data_init_pollfd() from the evlist pollfd array. */
	struct fdarray		pollfd;
	int			ctlfd_pos;
	/* nr_mmaps entries in maps/overwrite_maps, set by record__thread_data_init_maps(). */
	int			nr_mmaps;
	struct mmap		**maps;
	struct mmap		**overwrite_maps;
	struct record		*rec;
	unsigned long long	samples;	/* bumped by record__pushfn() */
	unsigned long		waking;
	u64			bytes_written;	/* bumped by record__write() for per-map files */
	u64			bytes_transferred;
	u64			bytes_compressed;
};
120 
/*
 * Thread-local pointer to the record_thread of the calling thread;
 * record__write() and record__pushfn() update its counters through it.
 */
static __thread struct record_thread *thread;
122 
/* Message identifiers; THREAD_MSG__MAX is the number of valid values. */
enum thread_msg {
	THREAD_MSG__UNDEFINED = 0,
	THREAD_MSG__READY,
	THREAD_MSG__MAX,
};

/* Printable name of each thread_msg value, indexed by the enum. */
static const char *thread_msg_tags[THREAD_MSG__MAX] = {
	[THREAD_MSG__UNDEFINED]	= "UNDEFINED",
	[THREAD_MSG__READY]	= "READY",
};
132 
/* Thread specification kinds; THREAD_SPEC__MAX is the number of valid values. */
enum thread_spec {
	THREAD_SPEC__UNDEFINED = 0,
	THREAD_SPEC__CPU,
	THREAD_SPEC__CORE,
	THREAD_SPEC__PACKAGE,
	THREAD_SPEC__NUMA,
	THREAD_SPEC__USER,
	THREAD_SPEC__MAX,
};

/* Printable name of each thread_spec value, indexed by the enum. */
static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
	[THREAD_SPEC__UNDEFINED]	= "undefined",
	[THREAD_SPEC__CPU]		= "cpu",
	[THREAD_SPEC__CORE]		= "core",
	[THREAD_SPEC__PACKAGE]		= "package",
	[THREAD_SPEC__NUMA]		= "numa",
	[THREAD_SPEC__USER]		= "user",
};
146 
/*
 * Associates one entry of the evlist pollfd array with one entry of a
 * thread's pollfd array; record__update_evlist_pollfd_from_thread() uses the
 * pair to copy revents from the thread entry to the evlist entry.
 */
struct pollfd_index_map {
	int evlist_pollfd_index;
	int thread_pollfd_index;
};
151 
/*
 * State of one 'perf record' run. The embedded tool member lets callbacks
 * recover the record from a perf_tool pointer via container_of().
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	/* Bytes written by record__write() to the session file ... */
	u64			bytes_written;
	/* ... and to per-map files, summed over all threads. */
	u64			thread_bytes_written;
	struct perf_data	data;
	struct auxtrace_record	*itr;
	struct evlist	*evlist;
	struct perf_session	*session;
	struct evlist		*sb_evlist;
	pthread_t		thread_id;
	int			realtime_prio;
	bool			switch_output_event_set;
	bool			no_buildid;
	bool			no_buildid_set;
	bool			no_buildid_cache;
	bool			no_buildid_cache_set;
	bool			buildid_all;
	bool			buildid_mmap;
	bool			timestamp_filename;
	bool			timestamp_boundary;
	bool			off_cpu;
	struct switch_output	switch_output;
	unsigned long long	samples;
	unsigned long		output_max_size;	/* = 0: unlimited */
	struct perf_debuginfod	debuginfod;
	/* thread_data holds nr_threads entries, see record__free_thread_data(). */
	int			nr_threads;
	struct thread_mask	*thread_masks;
	struct record_thread	*thread_data;
	/* Growable array: index_map_cnt entries in use, index_map_sz tracked by realloc_array_as_needed(). */
	struct pollfd_index_map	*index_map;
	size_t			index_map_sz;
	size_t			index_map_cnt;
};
185 
/* Set to 1 by sig_handler() and by record__write() once the output size limit is hit. */
static volatile int done;

/* Tested by record__auxtrace_snapshot_exit() to decide whether a snapshot must still be started. */
static volatile int auxtrace_record__snapshot_started;
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
static DEFINE_TRIGGER(switch_output_trigger);

/* Printable affinity names; presumably indexed by the PERF_AFFINITY_* values - confirm. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
195 
#ifndef HAVE_GETTID
/* Fallback gettid() built on the raw __NR_gettid system call. */
static inline pid_t gettid(void)
{
	long tid = syscall(__NR_gettid);

	return (pid_t)tid;
}
#endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
/*
 * Forward declarations for helpers used by the AIO code before their
 * definitions (zstd_compress() is defined outside this part of the file).
 */
static int record__aio_enabled(struct record *rec);
static int record__comp_enabled(struct record *rec);
static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
/*
 * Fill in @cblock and queue it with aio_write(), retrying for as long as the
 * call fails with EAGAIN.
 *
 * Returns 0 once the request is queued. On any other failure the control
 * block is marked unused (aio_fildes = -1) and the aio_write() result is
 * returned.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	for (;;) {
		rc = aio_write(cblock);
		if (rc == 0)
			break;

		if (errno == EAGAIN)
			continue;

		cblock->aio_fildes = -1;
		pr_err("failed to queue perf data, error: %m\n");
		break;
	}

	return rc;
}
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * remainder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
/* Context handed to record__aio_pushfn() through perf_mmap__push(). */
struct record_aio {
	struct record	*rec;
	void		*data;	/* destination buffer, one of map->aio.data[] */
	size_t		size;	/* bytes already stored in data */
};
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the remainder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard map->aio.data[] buffer
422 		 * from premature deallocation because map object can be
423 		 * released earlier than aio write request started on
424 		 * map->aio.data[] buffer is complete.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete()
427 		 * after started aio request completion or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
446 	 * becomes available after previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement map->refcount incremented in record__aio_pushfn()
465 		 * back if record__aio_write() operation failed to start, otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
/* Current file offset of @trace_fd, as reported by lseek(). */
static off_t record__aio_get_pos(int trace_fd)
{
	off_t pos = lseek(trace_fd, 0, SEEK_CUR);

	return pos;
}
479 
/* Move the file offset of @trace_fd to @pos; the lseek() result is not checked. */
static void record__aio_set_pos(int trace_fd, off_t pos)
{
	(void)lseek(trace_fd, pos, SEEK_SET);
}
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
/* Number of aio control blocks used when the option gives no (or a zero) value. */
static int nr_cblocks_default = 1;
/* NOTE(review): presumably the upper bound applied to opts->nr_cblocks; the check is not in this part of the file. */
static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
/* Built without HAVE_AIO_SUPPORT: the aio helpers below are stubs. */
static int nr_cblocks_max = 0;

/* Stub: always returns -1. */
static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

/* Stub: always returns -1. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Stub: does nothing. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Stub: does nothing. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
/*
 * Defined regardless of HAVE_ZSTD_SUPPORT.
 * NOTE(review): presumably the highest accepted compression level; the check
 * itself is not in this part of the file.
 */
static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
/* Last signal other than SIGCHLD seen by sig_handler(); -1 while none was seen. */
static volatile sig_atomic_t signr = -1;
/* Set to 1 by sig_handler() on SIGCHLD. */
static volatile sig_atomic_t child_finished;
#ifdef HAVE_EVENTFD_SUPPORT
/* fd written by sig_handler() to wake up poll(); negative while unused. */
static volatile sig_atomic_t done_fd = -1;
#endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
/* SIGSEGV handler: run perf_hooks__recover(), then dump the stack for @sig. */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
/* Built without HAVE_AUXTRACE_SUPPORT: the helpers below are stubs. */

/* Stub: nothing to read, always returns 0. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct mmap *map __maybe_unused)
{
	return 0;
}

/* Stub: does nothing. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

/* Stub: always returns 0. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

/* Stub: always returns 0. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

/* Stub: always returns 0. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}
889 
890 #endif
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If non-dummy evsel exists, system_wide sideband is need to
926 	 * help parse sample information.
927 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
928 	 * and PERF_EVENT_COMM event to help parse task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
/*
 * Open every event of rec->evlist, apply the event filters and map the ring
 * buffers.
 *
 * An event that fails to open is retried when evsel__fallback() accepts a
 * fallback, or when it is a non-leader member of a weak group and the error
 * is EINVAL/EBADF (the group is reset and opening resumes from the event
 * returned by evlist__reset_weak_group()).
 *
 * On success the evlist is attached to the session. Returns 0 on success,
 * a negated errno if an event cannot be opened, -1 if a filter cannot be
 * applied, or the error returned by record__mmap().
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* First chance: let evsel__fallback() adjust the event and retry. */
			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/* Second chance: reset a weak group and retry from the returned event. */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->core.leader != &pos->core &&
			    pos->weak_group) {
			        pos = evlist__reset_weak_group(evlist, pos, true);
				goto try_again;
			}
			/* Capture the error before the message helpers run. */
			rc = -errno;
			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");
	}

	/* On failure evlist__apply_filters() leaves the offending event in pos. */
	if (evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter ?: "BPF", evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
1409 
1410 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1411 {
1412 	if (rec->evlist->first_sample_time == 0)
1413 		rec->evlist->first_sample_time = sample_time;
1414 
1415 	if (sample_time)
1416 		rec->evlist->last_sample_time = sample_time;
1417 }
1418 
1419 static int process_sample_event(struct perf_tool *tool,
1420 				union perf_event *event,
1421 				struct perf_sample *sample,
1422 				struct evsel *evsel,
1423 				struct machine *machine)
1424 {
1425 	struct record *rec = container_of(tool, struct record, tool);
1426 
1427 	set_timestamp_boundary(rec, sample->time);
1428 
1429 	if (rec->buildid_all)
1430 		return 0;
1431 
1432 	rec->samples++;
1433 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1434 }
1435 
1436 static int process_buildids(struct record *rec)
1437 {
1438 	struct perf_session *session = rec->session;
1439 
1440 	if (perf_data__size(&rec->data) == 0)
1441 		return 0;
1442 
1443 	/*
1444 	 * During this process, it'll load kernel map and replace the
1445 	 * dso->long_name to a real pathname it found.  In this case
1446 	 * we prefer the vmlinux path like
1447 	 *   /lib/modules/3.16.4/build/vmlinux
1448 	 *
1449 	 * rather than build-id path (in debug directory).
1450 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1451 	 */
1452 	symbol_conf.ignore_vmlinux_buildid = true;
1453 
1454 	/*
1455 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1456 	 * so no need to process samples. But if timestamp_boundary is enabled,
1457 	 * it still needs to walk on all samples to get the timestamps of
1458 	 * first/last samples.
1459 	 */
1460 	if (rec->buildid_all && !rec->timestamp_boundary)
1461 		rec->tool.sample = NULL;
1462 
1463 	return perf_session__process_events(session);
1464 }
1465 
1466 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1467 {
1468 	int err;
1469 	struct perf_tool *tool = data;
1470 	/*
1471 	 *As for guest kernel when processing subcommand record&report,
1472 	 *we arrange module mmap prior to guest kernel mmap and trigger
1473 	 *a preload dso because default guest module symbols are loaded
1474 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1475 	 *method is used to avoid symbol missing when the first addr is
1476 	 *in module instead of in guest kernel.
1477 	 */
1478 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1479 					     machine);
1480 	if (err < 0)
1481 		pr_err("Couldn't record guest kernel [%d]'s reference"
1482 		       " relocation symbol.\n", machine->pid);
1483 
1484 	/*
1485 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1486 	 * have no _text sometimes.
1487 	 */
1488 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1489 						 machine);
1490 	if (err < 0)
1491 		pr_err("Couldn't record guest kernel [%d]'s reference"
1492 		       " relocation symbol.\n", machine->pid);
1493 }
1494 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event (its size is just the header).
 * record__mmap_read_evlist() writes it after a pass that wrote data when
 * threaded recording is not enabled.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

/*
 * Header-only PERF_RECORD_FINISHED_INIT event, written by
 * write_finished_init().
 */
static struct perf_event_header finished_init_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_INIT,
};
1504 
1505 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1506 {
1507 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1508 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1509 			  thread->mask->affinity.nbits)) {
1510 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1511 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1512 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1513 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1514 					(cpu_set_t *)thread->mask->affinity.bits);
1515 		if (verbose == 2) {
1516 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1517 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1518 		}
1519 	}
1520 }
1521 
1522 static size_t process_comp_header(void *record, size_t increment)
1523 {
1524 	struct perf_record_compressed *event = record;
1525 	size_t size = sizeof(*event);
1526 
1527 	if (increment) {
1528 		event->header.size += increment;
1529 		return increment;
1530 	}
1531 
1532 	event->header.type = PERF_RECORD_COMPRESSED;
1533 	event->header.size = size;
1534 
1535 	return size;
1536 }
1537 
1538 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1539 			    void *dst, size_t dst_size, void *src, size_t src_size)
1540 {
1541 	ssize_t compressed;
1542 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1543 	struct zstd_data *zstd_data = &session->zstd_data;
1544 
1545 	if (map && map->file)
1546 		zstd_data = &map->zstd_data;
1547 
1548 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1549 						     max_record_size, process_comp_header);
1550 	if (compressed < 0)
1551 		return compressed;
1552 
1553 	if (map && map->file) {
1554 		thread->bytes_transferred += src_size;
1555 		thread->bytes_compressed  += compressed;
1556 	} else {
1557 		session->bytes_transferred += src_size;
1558 		session->bytes_compressed  += compressed;
1559 	}
1560 
1561 	return compressed;
1562 }
1563 
/*
 * Drain the mmaps owned by the current recording thread (the file-level
 * 'thread' pointer) into the output.
 *
 * @overwrite: read the thread's overwrite maps instead of its regular maps.
 *             Overwrite maps are only read while the evlist's backward mmap
 *             state is BKW_MMAP_DATA_PENDING.
 * @synch:     temporarily set each map's flush threshold to 1 while pushing
 *             it; the previous threshold is restored afterwards.
 *
 * Returns 0 when there is nothing to do or everything was read, the result
 * of writing the finished-round event when one is written, or -1 as soon as
 * pushing a map or reading its auxtrace data fails.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	/* Remembered to detect whether this pass wrote anything. */
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	int nr_mmaps;
	struct mmap **maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	nr_mmaps = thread->nr_mmaps;
	maps = overwrite ? thread->overwrite_maps : thread->maps;

	if (!maps)
		return 0;

	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < nr_mmaps; i++) {
		u64 flush = 0;
		struct mmap *map = maps[i];

		if (map->core.base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Save the threshold; it is restored on every exit below. */
				flush = map->core.flush;
				map->core.flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					/* Store the current offset even though the push failed. */
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->core.flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->core.flush = flush;
		}

		/* AUX area data is not read here in snapshot or sample mode. */
		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    !rec->opts.auxtrace_sample_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 *
	 * No need for round events in directory mode,
	 * because per-cpu maps and files have data
	 * sorted by kernel.
	 */
	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	/* Not reached on the error paths above: they jump straight to 'out'. */
	if (overwrite)
		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1647 
1648 static int record__mmap_read_all(struct record *rec, bool synch)
1649 {
1650 	int err;
1651 
1652 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1653 	if (err)
1654 		return err;
1655 
1656 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1657 }
1658 
1659 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1660 					   void *arg __maybe_unused)
1661 {
1662 	struct perf_mmap *map = fda->priv[fd].ptr;
1663 
1664 	if (map)
1665 		perf_mmap__put(map);
1666 }
1667 
/*
 * Body of a recording thread.
 *
 * @arg: the struct record_thread describing this thread; it is stored in the
 *       file-level 'thread' pointer for the helpers called from here.
 *
 * The thread acknowledges its start on the ack pipe, then loops reading its
 * mmaps and polling its descriptors until a read fails, no descriptor is
 * left after filtering, or a hang-up is seen on the control descriptor. It
 * then does a final synchronous read and acknowledges termination.
 *
 * Always returns NULL.
 */
static void *record__thread(void *arg)
{
	enum thread_msg msg = THREAD_MSG__READY;
	bool terminate = false;
	struct fdarray *pollfd;
	int err, ctlfd_pos;

	thread = arg;
	thread->tid = gettid();

	/* Tell the other side of the ack pipe that this thread is running. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on start: %s\n",
			   thread->tid, strerror(errno));

	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());

	pollfd = &thread->pollfd;
	ctlfd_pos = thread->ctlfd_pos;

	for (;;) {
		unsigned long long hits = thread->samples;

		/*
		 * 'terminate' is tested after the read, so one more read pass
		 * runs after the hang-up was noticed.
		 */
		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
			break;

		/* Only block in poll when the read above did not change the sample count. */
		if (hits == thread->samples) {

			err = fdarray__poll(pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			/* NOTE(review): err is not examined again before it is overwritten below. */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				break;
		}

		/* Hang-up on the control descriptor: close it and stop polling it. */
		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
			terminate = true;
			close(thread->pipes.msg[0]);
			thread->pipes.msg[0] = -1;
			pollfd->entries[ctlfd_pos].fd = -1;
			pollfd->entries[ctlfd_pos].events = 0;
		}

		pollfd->entries[ctlfd_pos].revents = 0;
	}
	/* Final pass with synch set. */
	record__mmap_read_all(thread->rec, true);

	/* The same message value is sent again to acknowledge termination. */
	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
	if (err == -1)
		pr_warning("threads[%d]: failed to notify on termination: %s\n",
			   thread->tid, strerror(errno));

	return NULL;
}
1729 
1730 static void record__init_features(struct record *rec)
1731 {
1732 	struct perf_session *session = rec->session;
1733 	int feat;
1734 
1735 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1736 		perf_header__set_feat(&session->header, feat);
1737 
1738 	if (rec->no_buildid)
1739 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1740 
1741 #ifdef HAVE_LIBTRACEEVENT
1742 	if (!have_tracepoints(&rec->evlist->core.entries))
1743 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1744 #endif
1745 
1746 	if (!rec->opts.branch_stack)
1747 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1748 
1749 	if (!rec->opts.full_auxtrace)
1750 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1751 
1752 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1753 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1754 
1755 	if (!rec->opts.use_clockid)
1756 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1757 
1758 	if (!record__threads_enabled(rec))
1759 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1760 
1761 	if (!record__comp_enabled(rec))
1762 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1763 
1764 	perf_header__clear_feat(&session->header, HEADER_STAT);
1765 }
1766 
1767 static void
1768 record__finish_output(struct record *rec)
1769 {
1770 	int i;
1771 	struct perf_data *data = &rec->data;
1772 	int fd = perf_data__fd(data);
1773 
1774 	if (data->is_pipe) {
1775 		/* Just to display approx. size */
1776 		data->file.size = rec->bytes_written;
1777 		return;
1778 	}
1779 
1780 	rec->session->header.data_size += rec->bytes_written;
1781 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1782 	if (record__threads_enabled(rec)) {
1783 		for (i = 0; i < data->dir.nr; i++)
1784 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1785 	}
1786 
1787 	if (!rec->no_buildid) {
1788 		process_buildids(rec);
1789 
1790 		if (rec->buildid_all)
1791 			perf_session__dsos_hit_all(rec->session);
1792 	}
1793 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1794 
1795 	return;
1796 }
1797 
1798 static int record__synthesize_workload(struct record *rec, bool tail)
1799 {
1800 	int err;
1801 	struct perf_thread_map *thread_map;
1802 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1803 
1804 	if (rec->opts.tail_synthesize != tail)
1805 		return 0;
1806 
1807 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1808 	if (thread_map == NULL)
1809 		return -1;
1810 
1811 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1812 						 process_synthesized_event,
1813 						 &rec->session->machines.host,
1814 						 needs_mmap,
1815 						 rec->opts.sample_address);
1816 	perf_thread_map__put(thread_map);
1817 	return err;
1818 }
1819 
1820 static int write_finished_init(struct record *rec, bool tail)
1821 {
1822 	if (rec->opts.tail_synthesize != tail)
1823 		return 0;
1824 
1825 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1826 }
1827 
/* Forward declaration: record__switch_output() calls this before its definition. */
static int record__synthesize(struct record *rec, bool tail);
1829 
/*
 * Finish the current output file and switch to a new, timestamped one.
 *
 * @at_exit: true when called for the final dump; the counters are then not
 *           reset and no tracking events are synthesized afterwards.
 *
 * Returns the value returned by perf_data__switch(), or -EINVAL if the
 * current timestamp cannot be obtained.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	char *new_filename = NULL;
	int fd, err;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	record__aio_mmap_read_sync(rec);

	/* Tail-mode synthesis for the file being closed (no-ops unless tail_synthesize is set). */
	write_finished_init(rec, true);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
			       rec->session->header.data_offset,
			       at_exit, &new_filename);
	/* Start the byte accounting afresh for the new file. */
	if (fd >= 0 && !at_exit) {
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet) {
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);
	}

	if (rec->switch_output.num_files) {
		/*
		 * Keep at most num_files dumps: advance the ring index and
		 * remove the file that previously occupied the slot.
		 */
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		/* The slot takes ownership of new_filename. */
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
		write_finished_init(rec, false);
	}
	return fd;
}
1903 
1904 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1905 					struct perf_record_lost_samples *lost,
1906 					int cpu_idx, int thread_idx, u64 lost_count,
1907 					u16 misc_flag)
1908 {
1909 	struct perf_sample_id *sid;
1910 	struct perf_sample sample = {};
1911 	int id_hdr_size;
1912 
1913 	lost->lost = lost_count;
1914 	if (evsel->core.ids) {
1915 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1916 		sample.id = sid->id;
1917 	}
1918 
1919 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1920 						       evsel->core.attr.sample_type, &sample);
1921 	lost->header.size = sizeof(*lost) + id_hdr_size;
1922 	lost->header.misc = misc_flag;
1923 	record__write(rec, NULL, lost, lost->header.size);
1924 }
1925 
1926 static void record__read_lost_samples(struct record *rec)
1927 {
1928 	struct perf_session *session = rec->session;
1929 	struct perf_record_lost_samples *lost = NULL;
1930 	struct evsel *evsel;
1931 
1932 	/* there was an error during record__open */
1933 	if (session->evlist == NULL)
1934 		return;
1935 
1936 	evlist__for_each_entry(session->evlist, evsel) {
1937 		struct xyarray *xy = evsel->core.sample_id;
1938 		u64 lost_count;
1939 
1940 		if (xy == NULL || evsel->core.fd == NULL)
1941 			continue;
1942 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1943 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1944 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1945 			continue;
1946 		}
1947 
1948 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1949 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1950 				struct perf_counts_values count;
1951 
1952 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1953 					pr_debug("read LOST count failed\n");
1954 					goto out;
1955 				}
1956 
1957 				if (count.lost) {
1958 					if (!lost) {
1959 						lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1960 						if (!lost) {
1961 							pr_debug("Memory allocation failed\n");
1962 							return;
1963 						}
1964 						lost->header.type = PERF_RECORD_LOST_SAMPLES;
1965 					}
1966 					__record__save_lost_samples(rec, evsel, lost,
1967 								    x, y, count.lost, 0);
1968 				}
1969 			}
1970 		}
1971 
1972 		lost_count = perf_bpf_filter__lost_count(evsel);
1973 		if (lost_count) {
1974 			if (!lost) {
1975 				lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1976 				if (!lost) {
1977 					pr_debug("Memory allocation failed\n");
1978 					return;
1979 				}
1980 				lost->header.type = PERF_RECORD_LOST_SAMPLES;
1981 			}
1982 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1983 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1984 		}
1985 	}
1986 out:
1987 	free(lost);
1988 }
1989 
/* Error value delivered with the signal handled by workload_exec_failed_signal(). */
static volatile sig_atomic_t workload_exec_errno;

/*
 * evlist__prepare_workload will send a SIGUSR1 if launching the
 * workload fails, since we asked for it by setting its want_signal
 * to true.
 *
 * The handler stores the integer value carried by the signal in
 * workload_exec_errno and sets the 'done' and 'child_finished' flags.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;
	child_finished = 1;
}
2005 
/* Forward declarations of signal handlers defined elsewhere in this translation unit. */
static void snapshot_sig_handler(int sig);
static void alarm_sig_handler(int sig);
2008 
2009 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2010 {
2011 	if (evlist) {
2012 		if (evlist->mmap && evlist->mmap[0].core.base)
2013 			return evlist->mmap[0].core.base;
2014 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2015 			return evlist->overwrite_mmap[0].core.base;
2016 	}
2017 	return NULL;
2018 }
2019 
2020 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2021 {
2022 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2023 	if (pc)
2024 		return pc;
2025 	return NULL;
2026 }
2027 
/*
 * Synthesize the non-sample events that describe the recording environment
 * (time conversion, id index, auxtrace info, kernel and module maps, guest
 * machines, extra attributes, thread and cpu maps, BPF and cgroup events,
 * existing threads).
 *
 * @tail: the function only does work when this matches
 *        opts.tail_synthesize; otherwise it returns 0 immediately.
 *
 * Returns 0 on success or a negative error from the first step whose
 * failure is treated as fatal. Failures to synthesize kernel/module maps,
 * BPF events and cgroup events are only warned about.
 */
static int record__synthesize(struct record *rec, bool tail)
{
	struct perf_session *session = rec->session;
	struct machine *machine = &session->machines.host;
	struct perf_data *data = &rec->data;
	struct record_opts *opts = &rec->opts;
	struct perf_tool *tool = &rec->tool;
	int err = 0;
	event_op f = process_synthesized_event;

	if (rec->opts.tail_synthesize != tail)
		return 0;

	if (data->is_pipe) {
		err = perf_event__synthesize_for_pipe(tool, session, data,
						      process_synthesized_event);
		if (err < 0)
			goto out;

		/* A non-negative result is accounted as bytes written. */
		rec->bytes_written += err;
	}

	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
					  process_synthesized_event, machine);
	if (err)
		goto out;

	/* Synthesize id_index before auxtrace_info */
	err = perf_event__synthesize_id_index(tool,
					      process_synthesized_event,
					      session->evlist, machine);
	if (err)
		goto out;

	if (rec->opts.full_auxtrace) {
		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
					session, process_synthesized_event);
		if (err)
			goto out;
	}

	if (!evlist__exclude_kernel(rec->evlist)) {
		/*
		 * Failures here only produce a warning; err is overwritten
		 * by the calls that follow.
		 */
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine);
		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/kallsyms permission or run as root.\n");

		err = perf_event__synthesize_modules(tool, process_synthesized_event,
						     machine);
		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
				   "Check /proc/modules permission or run as root.\n");
	}

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	err = perf_event__synthesize_extra_attr(&rec->tool,
						rec->evlist,
						process_synthesized_event,
						data->is_pipe);
	if (err)
		goto out;

	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
						 process_synthesized_event,
						NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize thread map.\n");
		return err;
	}

	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
					     process_synthesized_event, NULL);
	if (err < 0) {
		pr_err("Couldn't synthesize cpu map.\n");
		return err;
	}

	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
						machine, opts);
	if (err < 0) {
		/* Not fatal: warn and carry on. */
		pr_warning("Couldn't synthesize bpf events.\n");
		err = 0;
	}

	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
						     machine);
		if (err < 0) {
			/* Not fatal: warn and carry on. */
			pr_warning("Couldn't synthesize cgroup events.\n");
			err = 0;
		}
	}

	/*
	 * With more than one synthesis thread, switch to the callback that
	 * takes synth_lock for the duration of the thread synthesis below.
	 */
	if (rec->opts.nr_threads_synthesize > 1) {
		mutex_init(&synth_lock);
		perf_set_multithreaded();
		f = process_locked_synthesized_event;
	}

	if (rec->opts.synth & PERF_SYNTH_TASK) {
		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;

		err = __machine__synthesize_threads(machine, tool, &opts->target,
						    rec->evlist->core.threads,
						    f, needs_mmap, opts->sample_address,
						    rec->opts.nr_threads_synthesize);
	}

	if (rec->opts.nr_threads_synthesize > 1) {
		perf_set_singlethreaded();
		mutex_destroy(&synth_lock);
	}

out:
	return err;
}
2149 
2150 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2151 {
2152 	struct record *rec = data;
2153 	pthread_kill(rec->thread_id, SIGUSR2);
2154 	return 0;
2155 }
2156 
2157 static int record__setup_sb_evlist(struct record *rec)
2158 {
2159 	struct record_opts *opts = &rec->opts;
2160 
2161 	if (rec->sb_evlist != NULL) {
2162 		/*
2163 		 * We get here if --switch-output-event populated the
2164 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2165 		 * to the main thread.
2166 		 */
2167 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2168 		rec->thread_id = pthread_self();
2169 	}
2170 #ifdef HAVE_LIBBPF_SUPPORT
2171 	if (!opts->no_bpf_event) {
2172 		if (rec->sb_evlist == NULL) {
2173 			rec->sb_evlist = evlist__new();
2174 
2175 			if (rec->sb_evlist == NULL) {
2176 				pr_err("Couldn't create side band evlist.\n.");
2177 				return -1;
2178 			}
2179 		}
2180 
2181 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2182 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2183 			return -1;
2184 		}
2185 	}
2186 #endif
2187 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2188 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2189 		opts->no_bpf_event = true;
2190 	}
2191 
2192 	return 0;
2193 }
2194 
2195 static int record__init_clock(struct record *rec)
2196 {
2197 	struct perf_session *session = rec->session;
2198 	struct timespec ref_clockid;
2199 	struct timeval ref_tod;
2200 	u64 ref;
2201 
2202 	if (!rec->opts.use_clockid)
2203 		return 0;
2204 
2205 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2206 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2207 
2208 	session->header.env.clock.clockid = rec->opts.clockid;
2209 
2210 	if (gettimeofday(&ref_tod, NULL) != 0) {
2211 		pr_err("gettimeofday failed, cannot set reference time.\n");
2212 		return -1;
2213 	}
2214 
2215 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2216 		pr_err("clock_gettime failed, cannot set reference time.\n");
2217 		return -1;
2218 	}
2219 
2220 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2221 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2222 
2223 	session->header.env.clock.tod_ns = ref;
2224 
2225 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2226 	      (u64) ref_clockid.tv_nsec;
2227 
2228 	session->header.env.clock.clockid_ns = ref;
2229 	return 0;
2230 }
2231 
2232 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2233 {
2234 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2235 		trigger_hit(&auxtrace_snapshot_trigger);
2236 		auxtrace_record__snapshot_started = 1;
2237 		if (auxtrace_record__snapshot_start(rec->itr))
2238 			trigger_error(&auxtrace_snapshot_trigger);
2239 	}
2240 }
2241 
2242 static int record__terminate_thread(struct record_thread *thread_data)
2243 {
2244 	int err;
2245 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2246 	pid_t tid = thread_data->tid;
2247 
2248 	close(thread_data->pipes.msg[1]);
2249 	thread_data->pipes.msg[1] = -1;
2250 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2251 	if (err > 0)
2252 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2253 	else
2254 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2255 			   thread->tid, tid);
2256 
2257 	return 0;
2258 }
2259 
2260 static int record__start_threads(struct record *rec)
2261 {
2262 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2263 	struct record_thread *thread_data = rec->thread_data;
2264 	sigset_t full, mask;
2265 	pthread_t handle;
2266 	pthread_attr_t attrs;
2267 
2268 	thread = &thread_data[0];
2269 
2270 	if (!record__threads_enabled(rec))
2271 		return 0;
2272 
2273 	sigfillset(&full);
2274 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2275 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2276 		return -1;
2277 	}
2278 
2279 	pthread_attr_init(&attrs);
2280 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2281 
2282 	for (t = 1; t < nr_threads; t++) {
2283 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2284 
2285 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2286 		pthread_attr_setaffinity_np(&attrs,
2287 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2288 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2289 #endif
2290 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2291 			for (tt = 1; tt < t; tt++)
2292 				record__terminate_thread(&thread_data[t]);
2293 			pr_err("Failed to start threads: %s\n", strerror(errno));
2294 			ret = -1;
2295 			goto out_err;
2296 		}
2297 
2298 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2299 		if (err > 0)
2300 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2301 				  thread_msg_tags[msg]);
2302 		else
2303 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2304 				   thread->tid, rec->thread_data[t].tid);
2305 	}
2306 
2307 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2308 			(cpu_set_t *)thread->mask->affinity.bits);
2309 
2310 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2311 
2312 out_err:
2313 	pthread_attr_destroy(&attrs);
2314 
2315 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2316 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2317 		ret = -1;
2318 	}
2319 
2320 	return ret;
2321 }
2322 
2323 static int record__stop_threads(struct record *rec)
2324 {
2325 	int t;
2326 	struct record_thread *thread_data = rec->thread_data;
2327 
2328 	for (t = 1; t < rec->nr_threads; t++)
2329 		record__terminate_thread(&thread_data[t]);
2330 
2331 	for (t = 0; t < rec->nr_threads; t++) {
2332 		rec->samples += thread_data[t].samples;
2333 		if (!record__threads_enabled(rec))
2334 			continue;
2335 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2336 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2337 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2338 			 thread_data[t].samples, thread_data[t].waking);
2339 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2340 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2341 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2342 		else
2343 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2344 	}
2345 
2346 	return 0;
2347 }
2348 
2349 static unsigned long record__waking(struct record *rec)
2350 {
2351 	int t;
2352 	unsigned long waking = 0;
2353 	struct record_thread *thread_data = rec->thread_data;
2354 
2355 	for (t = 0; t < rec->nr_threads; t++)
2356 		waking += thread_data[t].waking;
2357 
2358 	return waking;
2359 }
2360 
/*
 * Run one recording session described by @rec.
 *
 * In order, the function: installs signal handlers, creates the perf
 * session, initialises compression and the wakeup eventfd, optionally
 * prepares the workload given by @argv (when @argc > 0), opens and mmaps the
 * events, writes the header, synthesizes the initial events, starts the
 * reader threads and the workload, and then loops reading the mmaps until
 * `done` or `draining` is set and no new samples arrived.  Afterwards the
 * threads are stopped, the remaining data is flushed, the output is
 * finalised (or switched) and the session is deleted.
 *
 * Returns `status`: a negative error from one of the steps, or, when a
 * workload was forked and no error occurred, the workload's exit status.
 * Several early failures return directly with -1 or PTR_ERR(session).
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	/* A command line was given, so a workload will be forked. */
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	int fd;
	float ratio = 0;
	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	if (rec->opts.record_cgroup) {
#ifdef HAVE_FILE_HANDLE
		tool->cgroup_events = true;
#else
		pr_err("cgroup tracking is not supported\n");
		return -1;
#endif
	}

	/* SIGUSR2 drives AUX snapshots and output switching; ignore it otherwise. */
	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, tool);
	if (IS_ERR(session)) {
		pr_err("Perf session creation failed.\n");
		return PTR_ERR(session);
	}

	/*
	 * NOTE(review): the plain `return -1` statements between here and the
	 * workload preparation do not pass through out_delete_session, so the
	 * session created above looks like it is not deleted on those paths
	 * -- confirm whether that is intended.
	 */
	if (record__threads_enabled(rec)) {
		if (perf_data__is_pipe(&rec->data)) {
			pr_err("Parallel trace streaming is not available in pipe mode.\n");
			return -1;
		}
		if (rec->opts.full_auxtrace) {
			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
			return -1;
		}
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}
#ifdef HAVE_EVENTFD_SUPPORT
	/* The eventfd is added to the evlist poll list as a wakeup source. */
	done_fd = eventfd(0, EFD_NONBLOCK);
	if (done_fd < 0) {
		pr_err("Failed to create wakeup eventfd, error: %m\n");
		status = -1;
		goto out_delete_session;
	}
	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
	if (err < 0) {
		pr_err("Failed to add wakeup eventfd to poll list\n");
		status = err;
		goto out_delete_session;
	}
#endif // HAVE_EVENTFD_SUPPORT

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	if (rec->opts.kcore &&
	    !record__kcore_readable(&session->machines.host)) {
		pr_err("ERROR: kcore is not readable.\n");
		return -1;
	}

	if (record__init_clock(rec))
		return -1;

	record__init_features(rec);

	if (forks) {
		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
					       workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just single event and are sending data
	 * through pipe, we need to force the ids allocation,
	 * because we synthesize event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (rec->timestamp_filename && perf_data__is_pipe(data)) {
		rec->timestamp_filename = false;
		pr_warning("WARNING: --timestamp-filename option is not available in pipe mode.\n");
	}

	evlist__uniquify_name(rec->evlist);

	evlist__config(rec->evlist, opts, &callchain_param);

	/* Debug message used by test scripts */
	pr_debug3("perf record opening and mmapping events\n");
	if (record__open(rec) != 0) {
		err = -1;
		goto out_free_threads;
	}
	/* Debug message used by test scripts */
	pr_debug3("perf record done opening and mmapping events\n");
	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;

	if (rec->opts.kcore) {
		err = record__kcore_copy(&session->machines.host, data);
		if (err) {
			pr_err("ERROR: Failed to copy kcore\n");
			goto out_free_threads;
		}
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (evlist__nr_groups(rec->evlist) == 0)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_free_threads;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_free_threads;
	}

	/* Preset the error code for the build-id check below. */
	err = -1;
	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		goto out_free_threads;
	}

	err = record__setup_sb_evlist(rec);
	if (err)
		goto out_free_threads;

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_free_threads;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_free_threads;
		}
	}

	/*
	 * NOTE(review): `err` is not assigned on this failure path and is
	 * non-negative here (last set by record__synthesize()), so a thread
	 * start failure does not seem to reach `status` as an error -- confirm.
	 */
	if (record__start_threads(rec))
		goto out_free_threads;

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->target.initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		/* NOTE(review): `err` is left unchanged (non-negative) on this exit -- confirm. */
		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		evlist__start_workload(rec->evlist);
	}

	/*
	 * With an initial delay the events were not enabled above; a positive
	 * delay (in milliseconds) is slept here before enabling them.
	 */
	if (opts->target.initial_delay) {
		pr_info(EVLIST_DISABLED_MSG);
		if (opts->target.initial_delay > 0) {
			usleep(opts->target.initial_delay * USEC_PER_MSEC);
			evlist__enable(rec->evlist);
			pr_info(EVLIST_ENABLED_MSG);
		}
	}

	err = event_enable_timer__start(rec->evlist->eet);
	if (err)
		goto out_child;

	/* Debug message used by test scripts */
	pr_debug3("perf record has started\n");
	fflush(stderr);

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();

	/*
	 * Must write FINISHED_INIT so it will be seen after all other
	 * synthesized user events, but before any regular events.
	 */
	err = write_finished_init(rec, false);
	if (err < 0)
		goto out_child;

	/* Main loop: runs until done/draining is set and a pass yields no new samples. */
	for (;;) {
		/* Sample count before this pass, used to detect an idle pass. */
		unsigned long long hits = thread->samples;

		/*
		 * rec->evlist->bkw_mmap_state is possible to be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in previous round.
		 *
		 * evlist__toggle_bkw_mmap ensure we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 raise after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					record__waking(rec));
			thread->waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		/* Nothing new was read in this pass: either stop or block in poll. */
		if (hits == thread->samples) {
			if (done || draining)
				break;
			err = fdarray__poll(&thread->pollfd, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			thread->waking++;

			/* Once the filter leaves no descriptors, switch to draining. */
			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
					    record__thread_munmap_filtered, NULL) == 0)
				draining = true;

			/* NOTE(review): this overwrites the poll result stored in `err` just above -- confirm intended. */
			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
			if (err)
				goto out_child;
		}

		/* Only SNAPSHOT and STOP are acted upon here; other commands fall through. */
		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
			switch (cmd) {
			case EVLIST_CTL_CMD_SNAPSHOT:
				hit_auxtrace_snapshot_trigger(rec);
				evlist__ctlfd_ack(rec->evlist);
				break;
			case EVLIST_CTL_CMD_STOP:
				done = 1;
				break;
			case EVLIST_CTL_CMD_ACK:
			case EVLIST_CTL_CMD_UNSUPPORTED:
			case EVLIST_CTL_CMD_ENABLE:
			case EVLIST_CTL_CMD_DISABLE:
			case EVLIST_CTL_CMD_EVLIST:
			case EVLIST_CTL_CMD_PING:
			default:
				break;
			}
		}

		/* A positive result from the enable timer ends the recording. */
		err = event_enable_timer__process(rec->evlist->eet);
		if (err < 0)
			goto out_child;
		if (err) {
			err = 0;
			done = 1;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE], strevsels[2048];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));

		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);

		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
			strevsels, argv[0], emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
			record__waking(rec));

	write_finished_init(rec, true);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	/* Cleanup labels below are entered in sequence; order matters. */
out_child:
	record__stop_threads(rec);
	record__mmap_read_all(rec, true);
out_free_threads:
	record__free_thread_data(rec);
	evlist__finalize_ctlfd(rec->evlist);
	record__aio_mmap_read_sync(rec);

	/* Compression ratio is only computed when both byte counters are non-zero. */
	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		/* An internal error takes precedence over the workload's exit status. */
		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	if (rec->off_cpu)
		rec->bytes_written += off_cpu_write(rec->session);

	record__read_lost_samples(rec);
	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
#ifdef HAVE_EVENTFD_SUPPORT
	/* done_fd is reset to -1 before the descriptor is closed. */
	if (done_fd >= 0) {
		fd = done_fd;
		done_fd = -1;

		close(fd);
	}
#endif
	zstd_fini(&session->zstd_data);
	if (!opts->no_bpf_event)
		evlist__stop_sb_thread(rec->sb_evlist);

	perf_session__delete(session);
	return status;
}
2888 
2889 static void callchain_debug(struct callchain_param *callchain)
2890 {
2891 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2892 
2893 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2894 
2895 	if (callchain->record_mode == CALLCHAIN_DWARF)
2896 		pr_debug("callchain: stack dump size %d\n",
2897 			 callchain->dump_size);
2898 }
2899 
2900 int record_opts__parse_callchain(struct record_opts *record,
2901 				 struct callchain_param *callchain,
2902 				 const char *arg, bool unset)
2903 {
2904 	int ret;
2905 	callchain->enabled = !unset;
2906 
2907 	/* --no-call-graph */
2908 	if (unset) {
2909 		callchain->record_mode = CALLCHAIN_NONE;
2910 		pr_debug("callchain: disabled\n");
2911 		return 0;
2912 	}
2913 
2914 	ret = parse_callchain_record_opt(arg, callchain);
2915 	if (!ret) {
2916 		/* Enable data address sampling for DWARF unwind. */
2917 		if (callchain->record_mode == CALLCHAIN_DWARF)
2918 			record->sample_address = true;
2919 		callchain_debug(callchain);
2920 	}
2921 
2922 	return ret;
2923 }
2924 
2925 int record_parse_callchain_opt(const struct option *opt,
2926 			       const char *arg,
2927 			       int unset)
2928 {
2929 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2930 }
2931 
2932 int record_callchain_opt(const struct option *opt,
2933 			 const char *arg __maybe_unused,
2934 			 int unset __maybe_unused)
2935 {
2936 	struct callchain_param *callchain = opt->value;
2937 
2938 	callchain->enabled = true;
2939 
2940 	if (callchain->record_mode == CALLCHAIN_NONE)
2941 		callchain->record_mode = CALLCHAIN_FP;
2942 
2943 	callchain_debug(callchain);
2944 	return 0;
2945 }
2946 
2947 static int perf_record_config(const char *var, const char *value, void *cb)
2948 {
2949 	struct record *rec = cb;
2950 
2951 	if (!strcmp(var, "record.build-id")) {
2952 		if (!strcmp(value, "cache"))
2953 			rec->no_buildid_cache = false;
2954 		else if (!strcmp(value, "no-cache"))
2955 			rec->no_buildid_cache = true;
2956 		else if (!strcmp(value, "skip"))
2957 			rec->no_buildid = true;
2958 		else if (!strcmp(value, "mmap"))
2959 			rec->buildid_mmap = true;
2960 		else
2961 			return -1;
2962 		return 0;
2963 	}
2964 	if (!strcmp(var, "record.call-graph")) {
2965 		var = "call-graph.record-mode";
2966 		return perf_default_config(var, value, cb);
2967 	}
2968 #ifdef HAVE_AIO_SUPPORT
2969 	if (!strcmp(var, "record.aio")) {
2970 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2971 		if (!rec->opts.nr_cblocks)
2972 			rec->opts.nr_cblocks = nr_cblocks_default;
2973 	}
2974 #endif
2975 	if (!strcmp(var, "record.debuginfod")) {
2976 		rec->debuginfod.urls = strdup(value);
2977 		if (!rec->debuginfod.urls)
2978 			return -ENOMEM;
2979 		rec->debuginfod.set = true;
2980 	}
2981 
2982 	return 0;
2983 }
2984 
2985 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2986 {
2987 	struct record *rec = (struct record *)opt->value;
2988 
2989 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2990 }
2991 
2992 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2993 {
2994 	struct record_opts *opts = (struct record_opts *)opt->value;
2995 
2996 	if (unset || !str)
2997 		return 0;
2998 
2999 	if (!strcasecmp(str, "node"))
3000 		opts->affinity = PERF_AFFINITY_NODE;
3001 	else if (!strcasecmp(str, "cpu"))
3002 		opts->affinity = PERF_AFFINITY_CPU;
3003 
3004 	return 0;
3005 }
3006 
3007 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3008 {
3009 	mask->nbits = nr_bits;
3010 	mask->bits = bitmap_zalloc(mask->nbits);
3011 	if (!mask->bits)
3012 		return -ENOMEM;
3013 
3014 	return 0;
3015 }
3016 
3017 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3018 {
3019 	bitmap_free(mask->bits);
3020 	mask->nbits = 0;
3021 }
3022 
3023 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3024 {
3025 	int ret;
3026 
3027 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3028 	if (ret) {
3029 		mask->affinity.bits = NULL;
3030 		return ret;
3031 	}
3032 
3033 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3034 	if (ret) {
3035 		record__mmap_cpu_mask_free(&mask->maps);
3036 		mask->maps.bits = NULL;
3037 	}
3038 
3039 	return ret;
3040 }
3041 
3042 static void record__thread_mask_free(struct thread_mask *mask)
3043 {
3044 	record__mmap_cpu_mask_free(&mask->maps);
3045 	record__mmap_cpu_mask_free(&mask->affinity);
3046 }
3047 
3048 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3049 {
3050 	int s;
3051 	struct record_opts *opts = opt->value;
3052 
3053 	if (unset || !str || !strlen(str)) {
3054 		opts->threads_spec = THREAD_SPEC__CPU;
3055 	} else {
3056 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3057 			if (s == THREAD_SPEC__USER) {
3058 				opts->threads_user_spec = strdup(str);
3059 				if (!opts->threads_user_spec)
3060 					return -ENOMEM;
3061 				opts->threads_spec = THREAD_SPEC__USER;
3062 				break;
3063 			}
3064 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3065 				opts->threads_spec = s;
3066 				break;
3067 			}
3068 		}
3069 	}
3070 
3071 	if (opts->threads_spec == THREAD_SPEC__USER)
3072 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3073 	else
3074 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3075 
3076 	return 0;
3077 }
3078 
3079 static int parse_output_max_size(const struct option *opt,
3080 				 const char *str, int unset)
3081 {
3082 	unsigned long *s = (unsigned long *)opt->value;
3083 	static struct parse_tag tags_size[] = {
3084 		{ .tag  = 'B', .mult = 1       },
3085 		{ .tag  = 'K', .mult = 1 << 10 },
3086 		{ .tag  = 'M', .mult = 1 << 20 },
3087 		{ .tag  = 'G', .mult = 1 << 30 },
3088 		{ .tag  = 0 },
3089 	};
3090 	unsigned long val;
3091 
3092 	if (unset) {
3093 		*s = 0;
3094 		return 0;
3095 	}
3096 
3097 	val = parse_tag_value(str, tags_size);
3098 	if (val != (unsigned long) -1) {
3099 		*s = val;
3100 		return 0;
3101 	}
3102 
3103 	return -1;
3104 }
3105 
3106 static int record__parse_mmap_pages(const struct option *opt,
3107 				    const char *str,
3108 				    int unset __maybe_unused)
3109 {
3110 	struct record_opts *opts = opt->value;
3111 	char *s, *p;
3112 	unsigned int mmap_pages;
3113 	int ret;
3114 
3115 	if (!str)
3116 		return -EINVAL;
3117 
3118 	s = strdup(str);
3119 	if (!s)
3120 		return -ENOMEM;
3121 
3122 	p = strchr(s, ',');
3123 	if (p)
3124 		*p = '\0';
3125 
3126 	if (*s) {
3127 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3128 		if (ret)
3129 			goto out_free;
3130 		opts->mmap_pages = mmap_pages;
3131 	}
3132 
3133 	if (!p) {
3134 		ret = 0;
3135 		goto out_free;
3136 	}
3137 
3138 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3139 	if (ret)
3140 		goto out_free;
3141 
3142 	opts->auxtrace_mmap_pages = mmap_pages;
3143 
3144 out_free:
3145 	free(s);
3146 	return ret;
3147 }
3148 
/*
 * Default implementation: does nothing with @opts.
 *
 * The definition is marked __weak; presumably architecture-specific code
 * supplies a replacement where needed -- TODO confirm against the arch
 * directories.
 */
void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
{
}
3152 
3153 static int parse_control_option(const struct option *opt,
3154 				const char *str,
3155 				int unset __maybe_unused)
3156 {
3157 	struct record_opts *opts = opt->value;
3158 
3159 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3160 }
3161 
3162 static void switch_output_size_warn(struct record *rec)
3163 {
3164 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3165 	struct switch_output *s = &rec->switch_output;
3166 
3167 	wakeup_size /= 2;
3168 
3169 	if (s->size < wakeup_size) {
3170 		char buf[100];
3171 
3172 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3173 		pr_warning("WARNING: switch-output data size lower than "
3174 			   "wakeup kernel buffer size (%s) "
3175 			   "expect bigger perf.data sizes\n", buf);
3176 	}
3177 }
3178 
3179 static int switch_output_setup(struct record *rec)
3180 {
3181 	struct switch_output *s = &rec->switch_output;
3182 	static struct parse_tag tags_size[] = {
3183 		{ .tag  = 'B', .mult = 1       },
3184 		{ .tag  = 'K', .mult = 1 << 10 },
3185 		{ .tag  = 'M', .mult = 1 << 20 },
3186 		{ .tag  = 'G', .mult = 1 << 30 },
3187 		{ .tag  = 0 },
3188 	};
3189 	static struct parse_tag tags_time[] = {
3190 		{ .tag  = 's', .mult = 1        },
3191 		{ .tag  = 'm', .mult = 60       },
3192 		{ .tag  = 'h', .mult = 60*60    },
3193 		{ .tag  = 'd', .mult = 60*60*24 },
3194 		{ .tag  = 0 },
3195 	};
3196 	unsigned long val;
3197 
3198 	/*
3199 	 * If we're using --switch-output-events, then we imply its
3200 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3201 	 *  thread to its parent.
3202 	 */
3203 	if (rec->switch_output_event_set) {
3204 		if (record__threads_enabled(rec)) {
3205 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3206 			return 0;
3207 		}
3208 		goto do_signal;
3209 	}
3210 
3211 	if (!s->set)
3212 		return 0;
3213 
3214 	if (record__threads_enabled(rec)) {
3215 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3216 		return 0;
3217 	}
3218 
3219 	if (!strcmp(s->str, "signal")) {
3220 do_signal:
3221 		s->signal = true;
3222 		pr_debug("switch-output with SIGUSR2 signal\n");
3223 		goto enabled;
3224 	}
3225 
3226 	val = parse_tag_value(s->str, tags_size);
3227 	if (val != (unsigned long) -1) {
3228 		s->size = val;
3229 		pr_debug("switch-output with %s size threshold\n", s->str);
3230 		goto enabled;
3231 	}
3232 
3233 	val = parse_tag_value(s->str, tags_time);
3234 	if (val != (unsigned long) -1) {
3235 		s->time = val;
3236 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3237 			 s->str, s->time);
3238 		goto enabled;
3239 	}
3240 
3241 	return -1;
3242 
3243 enabled:
3244 	rec->timestamp_filename = true;
3245 	s->enabled              = true;
3246 
3247 	if (s->size && !rec->opts.no_buffering)
3248 		switch_output_size_warn(rec);
3249 
3250 	return 0;
3251 }
3252 
/* NULL-terminated synopsis lines handed to the option parser for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Non-static handle on the usage lines above. */
const char * const *record_usage = __record_usage;
3259 
3260 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3261 				  struct perf_sample *sample, struct machine *machine)
3262 {
3263 	/*
3264 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3265 	 * no need to add them twice.
3266 	 */
3267 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3268 		return 0;
3269 	return perf_event__process_mmap(tool, event, sample, machine);
3270 }
3271 
3272 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3273 				   struct perf_sample *sample, struct machine *machine)
3274 {
3275 	/*
3276 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3277 	 * no need to add them twice.
3278 	 */
3279 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3280 		return 0;
3281 
3282 	return perf_event__process_mmap2(tool, event, sample, machine);
3283 }
3284 
3285 static int process_timestamp_boundary(struct perf_tool *tool,
3286 				      union perf_event *event __maybe_unused,
3287 				      struct perf_sample *sample,
3288 				      struct machine *machine __maybe_unused)
3289 {
3290 	struct record *rec = container_of(tool, struct record, tool);
3291 
3292 	set_timestamp_boundary(rec, sample->time);
3293 	return 0;
3294 }
3295 
3296 static int parse_record_synth_option(const struct option *opt,
3297 				     const char *str,
3298 				     int unset __maybe_unused)
3299 {
3300 	struct record_opts *opts = opt->value;
3301 	char *p = strdup(str);
3302 
3303 	if (p == NULL)
3304 		return -1;
3305 
3306 	opts->synth = parse_synth_opt(p);
3307 	free(p);
3308 
3309 	if (opts->synth < 0) {
3310 		pr_err("Invalid synth option: %s\n", str);
3311 		return -1;
3312 	}
3313 	return 0;
3314 }
3315 
3316 /*
3317  * XXX Ideally would be local to cmd_record() and passed to a record__new
3318  * because we need to have access to it in record__exit, that is called
3319  * after cmd_record() exits, but since record_options need to be accessible to
3320  * builtin-script, leave it here.
3321  *
 * At least we don't touch it in all the other functions here directly.
3323  *
3324  * Just say no to tons of global variables, sigh.
3325  */
3326 static struct record record = {
3327 	.opts = {
3328 		.sample_time	     = true,
3329 		.mmap_pages	     = UINT_MAX,
3330 		.user_freq	     = UINT_MAX,
3331 		.user_interval	     = ULLONG_MAX,
3332 		.freq		     = 4000,
3333 		.target		     = {
3334 			.uses_mmap   = true,
3335 			.default_per_cpu = true,
3336 		},
3337 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3338 		.nr_threads_synthesize = 1,
3339 		.ctl_fd              = -1,
3340 		.ctl_fd_ack          = -1,
3341 		.synth               = PERF_SYNTH_ALL,
3342 	},
3343 	.tool = {
3344 		.sample		= process_sample_event,
3345 		.fork		= perf_event__process_fork,
3346 		.exit		= perf_event__process_exit,
3347 		.comm		= perf_event__process_comm,
3348 		.namespaces	= perf_event__process_namespaces,
3349 		.mmap		= build_id__process_mmap,
3350 		.mmap2		= build_id__process_mmap2,
3351 		.itrace_start	= process_timestamp_boundary,
3352 		.aux		= process_timestamp_boundary,
3353 		.ordered_events	= true,
3354 	},
3355 };
3356 
3357 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3358 	"\n\t\t\t\tDefault: fp";
3359 
3360 static bool dry_run;
3361 
3362 static struct parse_events_option_args parse_events_option_args = {
3363 	.evlistp = &record.evlist,
3364 };
3365 
3366 static struct parse_events_option_args switch_output_parse_events_option_args = {
3367 	.evlistp = &record.sb_evlist,
3368 };
3369 
3370 /*
3371  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3372  * with it and switch to use the library functions in perf_evlist that came
3373  * from builtin-record.c, i.e. use record_opts,
3374  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3375  * using pipes, etc.
3376  */
3377 static struct option __record_options[] = {
3378 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3379 		     "event selector. use 'perf list' to list available events",
3380 		     parse_events_option),
3381 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3382 		     "event filter", parse_filter),
3383 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3384 			   NULL, "don't record events from perf itself",
3385 			   exclude_perf),
3386 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3387 		    "record events on existing process id"),
3388 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3389 		    "record events on existing thread id"),
3390 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3391 		    "collect data with this RT SCHED_FIFO priority"),
3392 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3393 		    "collect data without buffering"),
3394 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3395 		    "collect raw sample records from all opened counters"),
3396 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3397 			    "system-wide collection from all CPUs"),
3398 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3399 		    "list of cpus to monitor"),
3400 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3401 	OPT_STRING('o', "output", &record.data.path, "file",
3402 		    "output file name"),
3403 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3404 			&record.opts.no_inherit_set,
3405 			"child tasks do not inherit counters"),
3406 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3407 		    "synthesize non-sample events at the end of output"),
3408 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3409 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3410 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3411 		    "Fail if the specified frequency can't be used"),
3412 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3413 		     "profile at this frequency",
3414 		      record__parse_freq),
3415 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3416 		     "number of mmap data pages and AUX area tracing mmap pages",
3417 		     record__parse_mmap_pages),
3418 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3419 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3420 		     record__mmap_flush_parse),
3421 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3422 			   NULL, "enables call-graph recording" ,
3423 			   &record_callchain_opt),
3424 	OPT_CALLBACK(0, "call-graph", &record.opts,
3425 		     "record_mode[,record_size]", record_callchain_help,
3426 		     &record_parse_callchain_opt),
3427 	OPT_INCR('v', "verbose", &verbose,
3428 		    "be more verbose (show counter open errors, etc)"),
3429 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3430 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3431 		    "per thread counts"),
3432 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3433 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3434 		    "Record the sample physical addresses"),
3435 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3436 		    "Record the sampled data address data page size"),
3437 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3438 		    "Record the sampled code address (ip) page size"),
3439 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3440 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3441 		    "Record the sample identifier"),
3442 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3443 			&record.opts.sample_time_set,
3444 			"Record the sample timestamps"),
3445 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3446 			"Record the sample period"),
3447 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3448 		    "don't sample"),
3449 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3450 			&record.no_buildid_cache_set,
3451 			"do not update the buildid cache"),
3452 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3453 			&record.no_buildid_set,
3454 			"do not collect buildids in perf.data"),
3455 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3456 		     "monitor event in cgroup name only",
3457 		     parse_cgroups),
3458 	OPT_CALLBACK('D', "delay", &record, "ms",
3459 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3460 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3461 		     record__parse_event_enable_time),
3462 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3463 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3464 		   "user to profile"),
3465 
3466 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3467 		     "branch any", "sample any taken branches",
3468 		     parse_branch_stack),
3469 
3470 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3471 		     "branch filter mask", "branch stack filter modes",
3472 		     parse_branch_stack),
3473 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3474 		    "sample by weight (on special events only)"),
3475 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3476 		    "sample transaction flags (special events only)"),
3477 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3478 		    "use per-thread mmaps"),
3479 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3480 		    "sample selected machine registers on interrupt,"
3481 		    " use '-I?' to list register names", parse_intr_regs),
3482 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3483 		    "sample selected machine registers on interrupt,"
3484 		    " use '--user-regs=?' to list register names", parse_user_regs),
3485 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3486 		    "Record running/enabled time of read (:S) events"),
3487 	OPT_CALLBACK('k', "clockid", &record.opts,
3488 	"clockid", "clockid to use for events, see clock_gettime()",
3489 	parse_clockid),
3490 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3491 			  "opts", "AUX area tracing Snapshot Mode", ""),
3492 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3493 			  "opts", "sample AUX area", ""),
3494 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3495 			"per thread proc mmap processing timeout in ms"),
3496 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3497 		    "Record namespaces events"),
3498 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3499 		    "Record cgroup events"),
3500 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3501 			&record.opts.record_switch_events_set,
3502 			"Record context switch events"),
3503 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3504 			 "Configure all used events to run in kernel space.",
3505 			 PARSE_OPT_EXCLUSIVE),
3506 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3507 			 "Configure all used events to run in user space.",
3508 			 PARSE_OPT_EXCLUSIVE),
3509 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3510 		    "collect kernel callchains"),
3511 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3512 		    "collect user callchains"),
3513 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3514 		   "file", "vmlinux pathname"),
3515 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3516 		    "Record build-id of all DSOs regardless of hits"),
3517 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3518 		    "Record build-id in map events"),
3519 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3520 		    "append timestamp to output filename"),
3521 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3522 		    "Record timestamp boundary (time of first/last samples)"),
3523 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3524 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3525 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3526 			  "signal"),
3527 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3528 			 &record.switch_output_event_set, "switch output event",
3529 			 "switch output event selector. use 'perf list' to list available events",
3530 			 parse_events_option_new_evlist),
3531 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3532 		   "Limit number of switch output generated files"),
3533 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3534 		    "Parse options then exit"),
3535 #ifdef HAVE_AIO_SUPPORT
3536 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3537 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3538 		     record__aio_parse),
3539 #endif
3540 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3541 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3542 		     record__parse_affinity),
3543 #ifdef HAVE_ZSTD_SUPPORT
3544 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3545 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3546 			    record__parse_comp_level),
3547 #endif
3548 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3549 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3550 	OPT_UINTEGER(0, "num-thread-synthesize",
3551 		     &record.opts.nr_threads_synthesize,
3552 		     "number of threads to run for event synthesis"),
3553 #ifdef HAVE_LIBPFM
3554 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3555 		"libpfm4 event selector. use 'perf list' to list available events",
3556 		parse_libpfm_events_option),
3557 #endif
3558 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3559 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3560 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3561 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3562 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3563 		      parse_control_option),
3564 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3565 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3566 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3567 			  &record.debuginfod.set, "debuginfod urls",
3568 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3569 			  "system"),
3570 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3571 			    "write collected trace data into several data files using parallel threads",
3572 			    record__parse_threads),
3573 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3574 	OPT_END()
3575 };
3576 
3577 struct option *record_options = __record_options;
3578 
3579 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3580 {
3581 	struct perf_cpu cpu;
3582 	int idx;
3583 
3584 	if (cpu_map__is_dummy(cpus))
3585 		return 0;
3586 
3587 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3588 		/* Return ENODEV is input cpu is greater than max cpu */
3589 		if ((unsigned long)cpu.cpu > mask->nbits)
3590 			return -ENODEV;
3591 		__set_bit(cpu.cpu, mask->bits);
3592 	}
3593 
3594 	return 0;
3595 }
3596 
3597 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3598 {
3599 	struct perf_cpu_map *cpus;
3600 
3601 	cpus = perf_cpu_map__new(mask_spec);
3602 	if (!cpus)
3603 		return -ENOMEM;
3604 
3605 	bitmap_zero(mask->bits, mask->nbits);
3606 	if (record__mmap_cpu_mask_init(mask, cpus))
3607 		return -ENODEV;
3608 
3609 	perf_cpu_map__put(cpus);
3610 
3611 	return 0;
3612 }
3613 
3614 static void record__free_thread_masks(struct record *rec, int nr_threads)
3615 {
3616 	int t;
3617 
3618 	if (rec->thread_masks)
3619 		for (t = 0; t < nr_threads; t++)
3620 			record__thread_mask_free(&rec->thread_masks[t]);
3621 
3622 	zfree(&rec->thread_masks);
3623 }
3624 
3625 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3626 {
3627 	int t, ret;
3628 
3629 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3630 	if (!rec->thread_masks) {
3631 		pr_err("Failed to allocate thread masks\n");
3632 		return -ENOMEM;
3633 	}
3634 
3635 	for (t = 0; t < nr_threads; t++) {
3636 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3637 		if (ret) {
3638 			pr_err("Failed to allocate thread masks[%d]\n", t);
3639 			goto out_free;
3640 		}
3641 	}
3642 
3643 	return 0;
3644 
3645 out_free:
3646 	record__free_thread_masks(rec, nr_threads);
3647 
3648 	return ret;
3649 }
3650 
3651 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3652 {
3653 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3654 
3655 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3656 	if (ret)
3657 		return ret;
3658 
3659 	rec->nr_threads = nr_cpus;
3660 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3661 
3662 	for (t = 0; t < rec->nr_threads; t++) {
3663 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3664 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3665 		if (verbose > 0) {
3666 			pr_debug("thread_masks[%d]: ", t);
3667 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3668 			pr_debug("thread_masks[%d]: ", t);
3669 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3670 		}
3671 	}
3672 
3673 	return 0;
3674 }
3675 
3676 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3677 					  const char **maps_spec, const char **affinity_spec,
3678 					  u32 nr_spec)
3679 {
3680 	u32 s;
3681 	int ret = 0, t = 0;
3682 	struct mmap_cpu_mask cpus_mask;
3683 	struct thread_mask thread_mask, full_mask, *thread_masks;
3684 
3685 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3686 	if (ret) {
3687 		pr_err("Failed to allocate CPUs mask\n");
3688 		return ret;
3689 	}
3690 
3691 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3692 	if (ret) {
3693 		pr_err("Failed to init cpu mask\n");
3694 		goto out_free_cpu_mask;
3695 	}
3696 
3697 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3698 	if (ret) {
3699 		pr_err("Failed to allocate full mask\n");
3700 		goto out_free_cpu_mask;
3701 	}
3702 
3703 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3704 	if (ret) {
3705 		pr_err("Failed to allocate thread mask\n");
3706 		goto out_free_full_and_cpu_masks;
3707 	}
3708 
3709 	for (s = 0; s < nr_spec; s++) {
3710 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3711 		if (ret) {
3712 			pr_err("Failed to initialize maps thread mask\n");
3713 			goto out_free;
3714 		}
3715 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3716 		if (ret) {
3717 			pr_err("Failed to initialize affinity thread mask\n");
3718 			goto out_free;
3719 		}
3720 
3721 		/* ignore invalid CPUs but do not allow empty masks */
3722 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3723 				cpus_mask.bits, thread_mask.maps.nbits)) {
3724 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3725 			ret = -EINVAL;
3726 			goto out_free;
3727 		}
3728 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3729 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3730 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3731 			ret = -EINVAL;
3732 			goto out_free;
3733 		}
3734 
3735 		/* do not allow intersection with other masks (full_mask) */
3736 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3737 				      thread_mask.maps.nbits)) {
3738 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3739 			ret = -EINVAL;
3740 			goto out_free;
3741 		}
3742 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3743 				      thread_mask.affinity.nbits)) {
3744 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3745 			ret = -EINVAL;
3746 			goto out_free;
3747 		}
3748 
3749 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3750 			  thread_mask.maps.bits, full_mask.maps.nbits);
3751 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3752 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3753 
3754 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3755 		if (!thread_masks) {
3756 			pr_err("Failed to reallocate thread masks\n");
3757 			ret = -ENOMEM;
3758 			goto out_free;
3759 		}
3760 		rec->thread_masks = thread_masks;
3761 		rec->thread_masks[t] = thread_mask;
3762 		if (verbose > 0) {
3763 			pr_debug("thread_masks[%d]: ", t);
3764 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3765 			pr_debug("thread_masks[%d]: ", t);
3766 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3767 		}
3768 		t++;
3769 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3770 		if (ret) {
3771 			pr_err("Failed to allocate thread mask\n");
3772 			goto out_free_full_and_cpu_masks;
3773 		}
3774 	}
3775 	rec->nr_threads = t;
3776 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3777 	if (!rec->nr_threads)
3778 		ret = -EINVAL;
3779 
3780 out_free:
3781 	record__thread_mask_free(&thread_mask);
3782 out_free_full_and_cpu_masks:
3783 	record__thread_mask_free(&full_mask);
3784 out_free_cpu_mask:
3785 	record__mmap_cpu_mask_free(&cpus_mask);
3786 
3787 	return ret;
3788 }
3789 
3790 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3791 {
3792 	int ret;
3793 	struct cpu_topology *topo;
3794 
3795 	topo = cpu_topology__new();
3796 	if (!topo) {
3797 		pr_err("Failed to allocate CPU topology\n");
3798 		return -ENOMEM;
3799 	}
3800 
3801 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3802 					     topo->core_cpus_list, topo->core_cpus_lists);
3803 	cpu_topology__delete(topo);
3804 
3805 	return ret;
3806 }
3807 
3808 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3809 {
3810 	int ret;
3811 	struct cpu_topology *topo;
3812 
3813 	topo = cpu_topology__new();
3814 	if (!topo) {
3815 		pr_err("Failed to allocate CPU topology\n");
3816 		return -ENOMEM;
3817 	}
3818 
3819 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3820 					     topo->package_cpus_list, topo->package_cpus_lists);
3821 	cpu_topology__delete(topo);
3822 
3823 	return ret;
3824 }
3825 
3826 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3827 {
3828 	u32 s;
3829 	int ret;
3830 	const char **spec;
3831 	struct numa_topology *topo;
3832 
3833 	topo = numa_topology__new();
3834 	if (!topo) {
3835 		pr_err("Failed to allocate NUMA topology\n");
3836 		return -ENOMEM;
3837 	}
3838 
3839 	spec = zalloc(topo->nr * sizeof(char *));
3840 	if (!spec) {
3841 		pr_err("Failed to allocate NUMA spec\n");
3842 		ret = -ENOMEM;
3843 		goto out_delete_topo;
3844 	}
3845 	for (s = 0; s < topo->nr; s++)
3846 		spec[s] = topo->nodes[s].cpus;
3847 
3848 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3849 
3850 	zfree(&spec);
3851 
3852 out_delete_topo:
3853 	numa_topology__delete(topo);
3854 
3855 	return ret;
3856 }
3857 
3858 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3859 {
3860 	int t, ret;
3861 	u32 s, nr_spec = 0;
3862 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3863 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3864 
3865 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3866 		spec = strtok_r(user_spec, ":", &spec_ptr);
3867 		if (spec == NULL)
3868 			break;
3869 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3870 		mask = strtok_r(spec, "/", &mask_ptr);
3871 		if (mask == NULL)
3872 			break;
3873 		pr_debug2("  maps mask: %s\n", mask);
3874 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3875 		if (!tmp_spec) {
3876 			pr_err("Failed to reallocate maps spec\n");
3877 			ret = -ENOMEM;
3878 			goto out_free;
3879 		}
3880 		maps_spec = tmp_spec;
3881 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3882 		if (!maps_spec[nr_spec]) {
3883 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3884 			ret = -ENOMEM;
3885 			goto out_free;
3886 		}
3887 		mask = strtok_r(NULL, "/", &mask_ptr);
3888 		if (mask == NULL) {
3889 			pr_err("Invalid thread maps or affinity specs\n");
3890 			ret = -EINVAL;
3891 			goto out_free;
3892 		}
3893 		pr_debug2("  affinity mask: %s\n", mask);
3894 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3895 		if (!tmp_spec) {
3896 			pr_err("Failed to reallocate affinity spec\n");
3897 			ret = -ENOMEM;
3898 			goto out_free;
3899 		}
3900 		affinity_spec = tmp_spec;
3901 		affinity_spec[nr_spec] = strdup(mask);
3902 		if (!affinity_spec[nr_spec]) {
3903 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3904 			ret = -ENOMEM;
3905 			goto out_free;
3906 		}
3907 		dup_mask = NULL;
3908 		nr_spec++;
3909 	}
3910 
3911 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3912 					     (const char **)affinity_spec, nr_spec);
3913 
3914 out_free:
3915 	free(dup_mask);
3916 	for (s = 0; s < nr_spec; s++) {
3917 		if (maps_spec)
3918 			free(maps_spec[s]);
3919 		if (affinity_spec)
3920 			free(affinity_spec[s]);
3921 	}
3922 	free(affinity_spec);
3923 	free(maps_spec);
3924 
3925 	return ret;
3926 }
3927 
3928 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3929 {
3930 	int ret;
3931 
3932 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3933 	if (ret)
3934 		return ret;
3935 
3936 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3937 		return -ENODEV;
3938 
3939 	rec->nr_threads = 1;
3940 
3941 	return 0;
3942 }
3943 
3944 static int record__init_thread_masks(struct record *rec)
3945 {
3946 	int ret = 0;
3947 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3948 
3949 	if (!record__threads_enabled(rec))
3950 		return record__init_thread_default_masks(rec, cpus);
3951 
3952 	if (evlist__per_thread(rec->evlist)) {
3953 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3954 		return -EINVAL;
3955 	}
3956 
3957 	switch (rec->opts.threads_spec) {
3958 	case THREAD_SPEC__CPU:
3959 		ret = record__init_thread_cpu_masks(rec, cpus);
3960 		break;
3961 	case THREAD_SPEC__CORE:
3962 		ret = record__init_thread_core_masks(rec, cpus);
3963 		break;
3964 	case THREAD_SPEC__PACKAGE:
3965 		ret = record__init_thread_package_masks(rec, cpus);
3966 		break;
3967 	case THREAD_SPEC__NUMA:
3968 		ret = record__init_thread_numa_masks(rec, cpus);
3969 		break;
3970 	case THREAD_SPEC__USER:
3971 		ret = record__init_thread_user_masks(rec, cpus);
3972 		break;
3973 	default:
3974 		break;
3975 	}
3976 
3977 	return ret;
3978 }
3979 
3980 int cmd_record(int argc, const char **argv)
3981 {
3982 	int err;
3983 	struct record *rec = &record;
3984 	char errbuf[BUFSIZ];
3985 
3986 	setlocale(LC_ALL, "");
3987 
3988 #ifndef HAVE_BPF_SKEL
3989 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3990 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3991 # undef set_nobuild
3992 #endif
3993 
3994 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3995 	symbol_conf.lazy_load_kernel_maps = true;
3996 	rec->opts.affinity = PERF_AFFINITY_SYS;
3997 
3998 	rec->evlist = evlist__new();
3999 	if (rec->evlist == NULL)
4000 		return -ENOMEM;
4001 
4002 	err = perf_config(perf_record_config, rec);
4003 	if (err)
4004 		return err;
4005 
4006 	argc = parse_options(argc, argv, record_options, record_usage,
4007 			    PARSE_OPT_STOP_AT_NON_OPTION);
4008 	if (quiet)
4009 		perf_quiet_option();
4010 
4011 	err = symbol__validate_sym_arguments();
4012 	if (err)
4013 		return err;
4014 
4015 	perf_debuginfod_setup(&record.debuginfod);
4016 
4017 	/* Make system wide (-a) the default target. */
4018 	if (!argc && target__none(&rec->opts.target))
4019 		rec->opts.target.system_wide = true;
4020 
4021 	if (nr_cgroups && !rec->opts.target.system_wide) {
4022 		usage_with_options_msg(record_usage, record_options,
4023 			"cgroup monitoring only available in system-wide mode");
4024 
4025 	}
4026 
4027 	if (rec->buildid_mmap) {
4028 		if (!perf_can_record_build_id()) {
4029 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4030 			err = -EINVAL;
4031 			goto out_opts;
4032 		}
4033 		pr_debug("Enabling build id in mmap2 events.\n");
4034 		/* Enable mmap build id synthesizing. */
4035 		symbol_conf.buildid_mmap2 = true;
4036 		/* Enable perf_event_attr::build_id bit. */
4037 		rec->opts.build_id = true;
4038 		/* Disable build id cache. */
4039 		rec->no_buildid = true;
4040 	}
4041 
4042 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4043 		pr_err("Kernel has no cgroup sampling support.\n");
4044 		err = -EINVAL;
4045 		goto out_opts;
4046 	}
4047 
4048 	if (rec->opts.kcore)
4049 		rec->opts.text_poke = true;
4050 
4051 	if (rec->opts.kcore || record__threads_enabled(rec))
4052 		rec->data.is_dir = true;
4053 
4054 	if (record__threads_enabled(rec)) {
4055 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4056 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4057 			goto out_opts;
4058 		}
4059 		if (record__aio_enabled(rec)) {
4060 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4061 			goto out_opts;
4062 		}
4063 	}
4064 
4065 	if (rec->opts.comp_level != 0) {
4066 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4067 		rec->no_buildid = true;
4068 	}
4069 
4070 	if (rec->opts.record_switch_events &&
4071 	    !perf_can_record_switch_events()) {
4072 		ui__error("kernel does not support recording context switch events\n");
4073 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4074 		err = -EINVAL;
4075 		goto out_opts;
4076 	}
4077 
4078 	if (switch_output_setup(rec)) {
4079 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4080 		err = -EINVAL;
4081 		goto out_opts;
4082 	}
4083 
4084 	if (rec->switch_output.time) {
4085 		signal(SIGALRM, alarm_sig_handler);
4086 		alarm(rec->switch_output.time);
4087 	}
4088 
4089 	if (rec->switch_output.num_files) {
4090 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
4091 						      sizeof(char *));
4092 		if (!rec->switch_output.filenames) {
4093 			err = -EINVAL;
4094 			goto out_opts;
4095 		}
4096 	}
4097 
4098 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4099 		rec->timestamp_filename = false;
4100 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4101 	}
4102 
4103 	/*
4104 	 * Allow aliases to facilitate the lookup of symbols for address
4105 	 * filters. Refer to auxtrace_parse_filters().
4106 	 */
4107 	symbol_conf.allow_aliases = true;
4108 
4109 	symbol__init(NULL);
4110 
4111 	err = record__auxtrace_init(rec);
4112 	if (err)
4113 		goto out;
4114 
4115 	if (dry_run)
4116 		goto out;
4117 
4118 	err = -ENOMEM;
4119 
4120 	if (rec->no_buildid_cache || rec->no_buildid) {
4121 		disable_buildid_cache();
4122 	} else if (rec->switch_output.enabled) {
4123 		/*
4124 		 * In 'perf record --switch-output', disable buildid
4125 		 * generation by default to reduce data file switching
4126 		 * overhead. Still generate buildid if they are required
4127 		 * explicitly using
4128 		 *
4129 		 *  perf record --switch-output --no-no-buildid \
4130 		 *              --no-no-buildid-cache
4131 		 *
4132 		 * Following code equals to:
4133 		 *
4134 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4135 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4136 		 *         disable_buildid_cache();
4137 		 */
4138 		bool disable = true;
4139 
4140 		if (rec->no_buildid_set && !rec->no_buildid)
4141 			disable = false;
4142 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4143 			disable = false;
4144 		if (disable) {
4145 			rec->no_buildid = true;
4146 			rec->no_buildid_cache = true;
4147 			disable_buildid_cache();
4148 		}
4149 	}
4150 
4151 	if (record.opts.overwrite)
4152 		record.opts.tail_synthesize = true;
4153 
4154 	if (rec->evlist->core.nr_entries == 0) {
4155 		bool can_profile_kernel = perf_event_paranoid_check(1);
4156 
4157 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4158 		if (err)
4159 			goto out;
4160 	}
4161 
4162 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4163 		rec->opts.no_inherit = true;
4164 
4165 	err = target__validate(&rec->opts.target);
4166 	if (err) {
4167 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4168 		ui__warning("%s\n", errbuf);
4169 	}
4170 
4171 	err = target__parse_uid(&rec->opts.target);
4172 	if (err) {
4173 		int saved_errno = errno;
4174 
4175 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4176 		ui__error("%s", errbuf);
4177 
4178 		err = -saved_errno;
4179 		goto out;
4180 	}
4181 
4182 	/* Enable ignoring missing threads when -u/-p option is defined. */
4183 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4184 
4185 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4186 
4187 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4188 		arch__add_leaf_frame_record_opts(&rec->opts);
4189 
4190 	err = -ENOMEM;
4191 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4192 		if (rec->opts.target.pid != NULL) {
4193 			pr_err("Couldn't create thread/CPU maps: %s\n",
4194 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4195 			goto out;
4196 		}
4197 		else
4198 			usage_with_options(record_usage, record_options);
4199 	}
4200 
4201 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4202 	if (err)
4203 		goto out;
4204 
4205 	/*
4206 	 * We take all buildids when the file contains
4207 	 * AUX area tracing data because we do not decode the
4208 	 * trace because it would take too long.
4209 	 */
4210 	if (rec->opts.full_auxtrace)
4211 		rec->buildid_all = true;
4212 
4213 	if (rec->opts.text_poke) {
4214 		err = record__config_text_poke(rec->evlist);
4215 		if (err) {
4216 			pr_err("record__config_text_poke failed, error %d\n", err);
4217 			goto out;
4218 		}
4219 	}
4220 
4221 	if (rec->off_cpu) {
4222 		err = record__config_off_cpu(rec);
4223 		if (err) {
4224 			pr_err("record__config_off_cpu failed, error %d\n", err);
4225 			goto out;
4226 		}
4227 	}
4228 
4229 	if (record_opts__config(&rec->opts)) {
4230 		err = -EINVAL;
4231 		goto out;
4232 	}
4233 
4234 	err = record__config_tracking_events(rec);
4235 	if (err) {
4236 		pr_err("record__config_tracking_events failed, error %d\n", err);
4237 		goto out;
4238 	}
4239 
4240 	err = record__init_thread_masks(rec);
4241 	if (err) {
4242 		pr_err("Failed to initialize parallel data streaming masks\n");
4243 		goto out;
4244 	}
4245 
4246 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4247 		rec->opts.nr_cblocks = nr_cblocks_max;
4248 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4249 
4250 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4251 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4252 
4253 	if (rec->opts.comp_level > comp_level_max)
4254 		rec->opts.comp_level = comp_level_max;
4255 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4256 
4257 	err = __cmd_record(&record, argc, argv);
4258 out:
4259 	evlist__delete(rec->evlist);
4260 	symbol__exit();
4261 	auxtrace_record__free(rec->itr);
4262 out_opts:
4263 	record__free_thread_masks(rec, rec->nr_threads);
4264 	rec->nr_threads = 0;
4265 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4266 	return err;
4267 }
4268 
4269 static void snapshot_sig_handler(int sig __maybe_unused)
4270 {
4271 	struct record *rec = &record;
4272 
4273 	hit_auxtrace_snapshot_trigger(rec);
4274 
4275 	if (switch_output_signal(rec))
4276 		trigger_hit(&switch_output_trigger);
4277 }
4278 
4279 static void alarm_sig_handler(int sig __maybe_unused)
4280 {
4281 	struct record *rec = &record;
4282 
4283 	if (switch_output_time(rec))
4284 		trigger_hit(&switch_output_trigger);
4285 }
4286