xref: /linux/tools/perf/builtin-record.c (revision 6c7353836a91b1479e6b81791cdc163fb04b4834)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199 	return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
274 			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278 		void *buf, size_t size, off_t off)
279 {
280 	int rc;
281 
282 	cblock->aio_fildes = trace_fd;
283 	cblock->aio_buf    = buf;
284 	cblock->aio_nbytes = size;
285 	cblock->aio_offset = off;
286 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287 
288 	do {
289 		rc = aio_write(cblock);
290 		if (rc == 0) {
291 			break;
292 		} else if (errno != EAGAIN) {
293 			cblock->aio_fildes = -1;
294 			pr_err("failed to queue perf data, error: %m\n");
295 			break;
296 		}
297 	} while (1);
298 
299 	return rc;
300 }
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * reminder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
383 struct record_aio {
384 	struct record	*rec;
385 	void		*data;
386 	size_t		size;
387 };
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the reminder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		ssize_t compressed = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 						   mmap__mmap_len(map) - aio->size,
410 						   buf, size);
411 		if (compressed < 0)
412 			return (int)compressed;
413 
414 		size = compressed;
415 	} else {
416 		memcpy(aio->data + aio->size, buf, size);
417 	}
418 
419 	if (!aio->size) {
420 		/*
421 		 * Increment map->refcount to guard map->aio.data[] buffer
422 		 * from premature deallocation because map object can be
423 		 * released earlier than aio write request started on
424 		 * map->aio.data[] buffer is complete.
425 		 *
426 		 * perf_mmap__put() is done at record__aio_complete()
427 		 * after started aio request completion or at record__aio_push()
428 		 * if the request failed to start.
429 		 */
430 		perf_mmap__get(&map->core);
431 	}
432 
433 	aio->size += size;
434 
435 	return size;
436 }
437 
438 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
439 {
440 	int ret, idx;
441 	int trace_fd = rec->session->data->file.fd;
442 	struct record_aio aio = { .rec = rec, .size = 0 };
443 
444 	/*
445 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
446 	 * becomes available after previous aio write operation.
447 	 */
448 
449 	idx = record__aio_sync(map, false);
450 	aio.data = map->aio.data[idx];
451 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
452 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
453 		return ret;
454 
455 	rec->samples++;
456 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
457 	if (!ret) {
458 		*off += aio.size;
459 		rec->bytes_written += aio.size;
460 		if (switch_output_size(rec))
461 			trigger_hit(&switch_output_trigger);
462 	} else {
463 		/*
464 		 * Decrement map->refcount incremented in record__aio_pushfn()
465 		 * back if record__aio_write() operation failed to start, otherwise
466 		 * map->refcount is decremented in record__aio_complete() after
467 		 * aio write operation finishes successfully.
468 		 */
469 		perf_mmap__put(&map->core);
470 	}
471 
472 	return ret;
473 }
474 
475 static off_t record__aio_get_pos(int trace_fd)
476 {
477 	return lseek(trace_fd, 0, SEEK_CUR);
478 }
479 
480 static void record__aio_set_pos(int trace_fd, off_t pos)
481 {
482 	lseek(trace_fd, pos, SEEK_SET);
483 }
484 
485 static void record__aio_mmap_read_sync(struct record *rec)
486 {
487 	int i;
488 	struct evlist *evlist = rec->evlist;
489 	struct mmap *maps = evlist->mmap;
490 
491 	if (!record__aio_enabled(rec))
492 		return;
493 
494 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
495 		struct mmap *map = &maps[i];
496 
497 		if (map->core.base)
498 			record__aio_sync(map, true);
499 	}
500 }
501 
502 static int nr_cblocks_default = 1;
503 static int nr_cblocks_max = 4;
504 
505 static int record__aio_parse(const struct option *opt,
506 			     const char *str,
507 			     int unset)
508 {
509 	struct record_opts *opts = (struct record_opts *)opt->value;
510 
511 	if (unset) {
512 		opts->nr_cblocks = 0;
513 	} else {
514 		if (str)
515 			opts->nr_cblocks = strtol(str, NULL, 0);
516 		if (!opts->nr_cblocks)
517 			opts->nr_cblocks = nr_cblocks_default;
518 	}
519 
520 	return 0;
521 }
522 #else /* HAVE_AIO_SUPPORT */
523 static int nr_cblocks_max = 0;
524 
525 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
526 			    off_t *off __maybe_unused)
527 {
528 	return -1;
529 }
530 
531 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
532 {
533 	return -1;
534 }
535 
536 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
537 {
538 }
539 
540 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
541 {
542 }
543 #endif
544 
545 static int record__aio_enabled(struct record *rec)
546 {
547 	return rec->opts.nr_cblocks > 0;
548 }
549 
550 #define MMAP_FLUSH_DEFAULT 1
551 static int record__mmap_flush_parse(const struct option *opt,
552 				    const char *str,
553 				    int unset)
554 {
555 	int flush_max;
556 	struct record_opts *opts = (struct record_opts *)opt->value;
557 	static struct parse_tag tags[] = {
558 			{ .tag  = 'B', .mult = 1       },
559 			{ .tag  = 'K', .mult = 1 << 10 },
560 			{ .tag  = 'M', .mult = 1 << 20 },
561 			{ .tag  = 'G', .mult = 1 << 30 },
562 			{ .tag  = 0 },
563 	};
564 
565 	if (unset)
566 		return 0;
567 
568 	if (str) {
569 		opts->mmap_flush = parse_tag_value(str, tags);
570 		if (opts->mmap_flush == (int)-1)
571 			opts->mmap_flush = strtol(str, NULL, 0);
572 	}
573 
574 	if (!opts->mmap_flush)
575 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
576 
577 	flush_max = evlist__mmap_size(opts->mmap_pages);
578 	flush_max /= 4;
579 	if (opts->mmap_flush > flush_max)
580 		opts->mmap_flush = flush_max;
581 
582 	return 0;
583 }
584 
585 #ifdef HAVE_ZSTD_SUPPORT
586 static unsigned int comp_level_default = 1;
587 
588 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
589 {
590 	struct record_opts *opts = opt->value;
591 
592 	if (unset) {
593 		opts->comp_level = 0;
594 	} else {
595 		if (str)
596 			opts->comp_level = strtol(str, NULL, 0);
597 		if (!opts->comp_level)
598 			opts->comp_level = comp_level_default;
599 	}
600 
601 	return 0;
602 }
603 #endif
604 static unsigned int comp_level_max = 22;
605 
606 static int record__comp_enabled(struct record *rec)
607 {
608 	return rec->opts.comp_level > 0;
609 }
610 
611 static int process_synthesized_event(struct perf_tool *tool,
612 				     union perf_event *event,
613 				     struct perf_sample *sample __maybe_unused,
614 				     struct machine *machine __maybe_unused)
615 {
616 	struct record *rec = container_of(tool, struct record, tool);
617 	return record__write(rec, NULL, event, event->header.size);
618 }
619 
620 static struct mutex synth_lock;
621 
622 static int process_locked_synthesized_event(struct perf_tool *tool,
623 				     union perf_event *event,
624 				     struct perf_sample *sample __maybe_unused,
625 				     struct machine *machine __maybe_unused)
626 {
627 	int ret;
628 
629 	mutex_lock(&synth_lock);
630 	ret = process_synthesized_event(tool, event, sample, machine);
631 	mutex_unlock(&synth_lock);
632 	return ret;
633 }
634 
635 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
636 {
637 	struct record *rec = to;
638 
639 	if (record__comp_enabled(rec)) {
640 		ssize_t compressed = zstd_compress(rec->session, map, map->data,
641 						   mmap__mmap_len(map), bf, size);
642 
643 		if (compressed < 0)
644 			return (int)compressed;
645 
646 		size = compressed;
647 		bf   = map->data;
648 	}
649 
650 	thread->samples++;
651 	return record__write(rec, map, bf, size);
652 }
653 
654 static volatile sig_atomic_t signr = -1;
655 static volatile sig_atomic_t child_finished;
656 #ifdef HAVE_EVENTFD_SUPPORT
657 static volatile sig_atomic_t done_fd = -1;
658 #endif
659 
660 static void sig_handler(int sig)
661 {
662 	if (sig == SIGCHLD)
663 		child_finished = 1;
664 	else
665 		signr = sig;
666 
667 	done = 1;
668 #ifdef HAVE_EVENTFD_SUPPORT
669 	if (done_fd >= 0) {
670 		u64 tmp = 1;
671 		int orig_errno = errno;
672 
673 		/*
674 		 * It is possible for this signal handler to run after done is
675 		 * checked in the main loop, but before the perf counter fds are
676 		 * polled. If this happens, the poll() will continue to wait
677 		 * even though done is set, and will only break out if either
678 		 * another signal is received, or the counters are ready for
679 		 * read. To ensure the poll() doesn't sleep when done is set,
680 		 * use an eventfd (done_fd) to wake up the poll().
681 		 */
682 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
683 			pr_err("failed to signal wakeup fd, error: %m\n");
684 
685 		errno = orig_errno;
686 	}
687 #endif // HAVE_EVENTFD_SUPPORT
688 }
689 
690 static void sigsegv_handler(int sig)
691 {
692 	perf_hooks__recover();
693 	sighandler_dump_stack(sig);
694 }
695 
696 static void record__sig_exit(void)
697 {
698 	if (signr == -1)
699 		return;
700 
701 	signal(signr, SIG_DFL);
702 	raise(signr);
703 }
704 
705 #ifdef HAVE_AUXTRACE_SUPPORT
706 
707 static int record__process_auxtrace(struct perf_tool *tool,
708 				    struct mmap *map,
709 				    union perf_event *event, void *data1,
710 				    size_t len1, void *data2, size_t len2)
711 {
712 	struct record *rec = container_of(tool, struct record, tool);
713 	struct perf_data *data = &rec->data;
714 	size_t padding;
715 	u8 pad[8] = {0};
716 
717 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
718 		off_t file_offset;
719 		int fd = perf_data__fd(data);
720 		int err;
721 
722 		file_offset = lseek(fd, 0, SEEK_CUR);
723 		if (file_offset == -1)
724 			return -1;
725 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
726 						     event, file_offset);
727 		if (err)
728 			return err;
729 	}
730 
731 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
732 	padding = (len1 + len2) & 7;
733 	if (padding)
734 		padding = 8 - padding;
735 
736 	record__write(rec, map, event, event->header.size);
737 	record__write(rec, map, data1, len1);
738 	if (len2)
739 		record__write(rec, map, data2, len2);
740 	record__write(rec, map, &pad, padding);
741 
742 	return 0;
743 }
744 
745 static int record__auxtrace_mmap_read(struct record *rec,
746 				      struct mmap *map)
747 {
748 	int ret;
749 
750 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
751 				  record__process_auxtrace);
752 	if (ret < 0)
753 		return ret;
754 
755 	if (ret)
756 		rec->samples++;
757 
758 	return 0;
759 }
760 
761 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
762 					       struct mmap *map)
763 {
764 	int ret;
765 
766 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
767 					   record__process_auxtrace,
768 					   rec->opts.auxtrace_snapshot_size);
769 	if (ret < 0)
770 		return ret;
771 
772 	if (ret)
773 		rec->samples++;
774 
775 	return 0;
776 }
777 
778 static int record__auxtrace_read_snapshot_all(struct record *rec)
779 {
780 	int i;
781 	int rc = 0;
782 
783 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
784 		struct mmap *map = &rec->evlist->mmap[i];
785 
786 		if (!map->auxtrace_mmap.base)
787 			continue;
788 
789 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
790 			rc = -1;
791 			goto out;
792 		}
793 	}
794 out:
795 	return rc;
796 }
797 
798 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
799 {
800 	pr_debug("Recording AUX area tracing snapshot\n");
801 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
802 		trigger_error(&auxtrace_snapshot_trigger);
803 	} else {
804 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
805 			trigger_error(&auxtrace_snapshot_trigger);
806 		else
807 			trigger_ready(&auxtrace_snapshot_trigger);
808 	}
809 }
810 
811 static int record__auxtrace_snapshot_exit(struct record *rec)
812 {
813 	if (trigger_is_error(&auxtrace_snapshot_trigger))
814 		return 0;
815 
816 	if (!auxtrace_record__snapshot_started &&
817 	    auxtrace_record__snapshot_start(rec->itr))
818 		return -1;
819 
820 	record__read_auxtrace_snapshot(rec, true);
821 	if (trigger_is_error(&auxtrace_snapshot_trigger))
822 		return -1;
823 
824 	return 0;
825 }
826 
827 static int record__auxtrace_init(struct record *rec)
828 {
829 	int err;
830 
831 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
832 	    && record__threads_enabled(rec)) {
833 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
834 		return -EINVAL;
835 	}
836 
837 	if (!rec->itr) {
838 		rec->itr = auxtrace_record__init(rec->evlist, &err);
839 		if (err)
840 			return err;
841 	}
842 
843 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
844 					      rec->opts.auxtrace_snapshot_opts);
845 	if (err)
846 		return err;
847 
848 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
849 					    rec->opts.auxtrace_sample_opts);
850 	if (err)
851 		return err;
852 
853 	auxtrace_regroup_aux_output(rec->evlist);
854 
855 	return auxtrace_parse_filters(rec->evlist);
856 }
857 
858 #else
859 
860 static inline
861 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
862 			       struct mmap *map __maybe_unused)
863 {
864 	return 0;
865 }
866 
867 static inline
868 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
869 				    bool on_exit __maybe_unused)
870 {
871 }
872 
873 static inline
874 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
875 {
876 	return 0;
877 }
878 
879 static inline
880 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
881 {
882 	return 0;
883 }
884 
885 static int record__auxtrace_init(struct record *rec __maybe_unused)
886 {
887 	return 0;
888 }
889 
890 #endif
891 
892 static int record__config_text_poke(struct evlist *evlist)
893 {
894 	struct evsel *evsel;
895 
896 	/* Nothing to do if text poke is already configured */
897 	evlist__for_each_entry(evlist, evsel) {
898 		if (evsel->core.attr.text_poke)
899 			return 0;
900 	}
901 
902 	evsel = evlist__add_dummy_on_all_cpus(evlist);
903 	if (!evsel)
904 		return -ENOMEM;
905 
906 	evsel->core.attr.text_poke = 1;
907 	evsel->core.attr.ksymbol = 1;
908 	evsel->immediate = true;
909 	evsel__set_sample_bit(evsel, TIME);
910 
911 	return 0;
912 }
913 
914 static int record__config_off_cpu(struct record *rec)
915 {
916 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
917 }
918 
919 static bool record__tracking_system_wide(struct record *rec)
920 {
921 	struct evlist *evlist = rec->evlist;
922 	struct evsel *evsel;
923 
924 	/*
925 	 * If non-dummy evsel exists, system_wide sideband is need to
926 	 * help parse sample information.
927 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
928 	 * and PERF_EVENT_COMM event to help parse task executable name.
929 	 */
930 	evlist__for_each_entry(evlist, evsel) {
931 		if (!evsel__is_dummy_event(evsel))
932 			return true;
933 	}
934 
935 	return false;
936 }
937 
938 static int record__config_tracking_events(struct record *rec)
939 {
940 	struct record_opts *opts = &rec->opts;
941 	struct evlist *evlist = rec->evlist;
942 	bool system_wide = false;
943 	struct evsel *evsel;
944 
945 	/*
946 	 * For initial_delay, system wide or a hybrid system, we need to add
947 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
948 	 * delay of waiting or event synthesis.
949 	 */
950 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
951 	    perf_pmus__num_core_pmus() > 1) {
952 
953 		/*
954 		 * User space tasks can migrate between CPUs, so when tracing
955 		 * selected CPUs, sideband for all CPUs is still needed.
956 		 */
957 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
958 			system_wide = true;
959 
960 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
961 		if (!evsel)
962 			return -ENOMEM;
963 
964 		/*
965 		 * Enable the tracking event when the process is forked for
966 		 * initial_delay, immediately for system wide.
967 		 */
968 		if (opts->target.initial_delay && !evsel->immediate &&
969 		    !target__has_cpu(&opts->target))
970 			evsel->core.attr.enable_on_exec = 1;
971 		else
972 			evsel->immediate = 1;
973 	}
974 
975 	return 0;
976 }
977 
978 static bool record__kcore_readable(struct machine *machine)
979 {
980 	char kcore[PATH_MAX];
981 	int fd;
982 
983 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
984 
985 	fd = open(kcore, O_RDONLY);
986 	if (fd < 0)
987 		return false;
988 
989 	close(fd);
990 
991 	return true;
992 }
993 
994 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
995 {
996 	char from_dir[PATH_MAX];
997 	char kcore_dir[PATH_MAX];
998 	int ret;
999 
1000 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
1001 
1002 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
1003 	if (ret)
1004 		return ret;
1005 
1006 	return kcore_copy(from_dir, kcore_dir);
1007 }
1008 
1009 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1010 {
1011 	thread_data->pipes.msg[0] = -1;
1012 	thread_data->pipes.msg[1] = -1;
1013 	thread_data->pipes.ack[0] = -1;
1014 	thread_data->pipes.ack[1] = -1;
1015 }
1016 
1017 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1018 {
1019 	if (pipe(thread_data->pipes.msg))
1020 		return -EINVAL;
1021 
1022 	if (pipe(thread_data->pipes.ack)) {
1023 		close(thread_data->pipes.msg[0]);
1024 		thread_data->pipes.msg[0] = -1;
1025 		close(thread_data->pipes.msg[1]);
1026 		thread_data->pipes.msg[1] = -1;
1027 		return -EINVAL;
1028 	}
1029 
1030 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1031 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1032 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1033 
1034 	return 0;
1035 }
1036 
1037 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1038 {
1039 	if (thread_data->pipes.msg[0] != -1) {
1040 		close(thread_data->pipes.msg[0]);
1041 		thread_data->pipes.msg[0] = -1;
1042 	}
1043 	if (thread_data->pipes.msg[1] != -1) {
1044 		close(thread_data->pipes.msg[1]);
1045 		thread_data->pipes.msg[1] = -1;
1046 	}
1047 	if (thread_data->pipes.ack[0] != -1) {
1048 		close(thread_data->pipes.ack[0]);
1049 		thread_data->pipes.ack[0] = -1;
1050 	}
1051 	if (thread_data->pipes.ack[1] != -1) {
1052 		close(thread_data->pipes.ack[1]);
1053 		thread_data->pipes.ack[1] = -1;
1054 	}
1055 }
1056 
1057 static bool evlist__per_thread(struct evlist *evlist)
1058 {
1059 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1060 }
1061 
1062 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1063 {
1064 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1065 	struct mmap *mmap = evlist->mmap;
1066 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1067 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1068 	bool per_thread = evlist__per_thread(evlist);
1069 
1070 	if (per_thread)
1071 		thread_data->nr_mmaps = nr_mmaps;
1072 	else
1073 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1074 						      thread_data->mask->maps.nbits);
1075 	if (mmap) {
1076 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1077 		if (!thread_data->maps)
1078 			return -ENOMEM;
1079 	}
1080 	if (overwrite_mmap) {
1081 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1082 		if (!thread_data->overwrite_maps) {
1083 			zfree(&thread_data->maps);
1084 			return -ENOMEM;
1085 		}
1086 	}
1087 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1088 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1089 
1090 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1091 		if (per_thread ||
1092 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1093 			if (thread_data->maps) {
1094 				thread_data->maps[tm] = &mmap[m];
1095 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1096 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1097 			}
1098 			if (thread_data->overwrite_maps) {
1099 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1100 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1101 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1102 			}
1103 			tm++;
1104 		}
1105 	}
1106 
1107 	return 0;
1108 }
1109 
1110 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1111 {
1112 	int f, tm, pos;
1113 	struct mmap *map, *overwrite_map;
1114 
1115 	fdarray__init(&thread_data->pollfd, 64);
1116 
1117 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1118 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1119 		overwrite_map = thread_data->overwrite_maps ?
1120 				thread_data->overwrite_maps[tm] : NULL;
1121 
1122 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1123 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1124 
1125 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1126 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1127 							      &evlist->core.pollfd);
1128 				if (pos < 0)
1129 					return pos;
1130 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1131 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1132 			}
1133 		}
1134 	}
1135 
1136 	return 0;
1137 }
1138 
1139 static void record__free_thread_data(struct record *rec)
1140 {
1141 	int t;
1142 	struct record_thread *thread_data = rec->thread_data;
1143 
1144 	if (thread_data == NULL)
1145 		return;
1146 
1147 	for (t = 0; t < rec->nr_threads; t++) {
1148 		record__thread_data_close_pipes(&thread_data[t]);
1149 		zfree(&thread_data[t].maps);
1150 		zfree(&thread_data[t].overwrite_maps);
1151 		fdarray__exit(&thread_data[t].pollfd);
1152 	}
1153 
1154 	zfree(&rec->thread_data);
1155 }
1156 
1157 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1158 						    int evlist_pollfd_index,
1159 						    int thread_pollfd_index)
1160 {
1161 	size_t x = rec->index_map_cnt;
1162 
1163 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1164 		return -ENOMEM;
1165 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1166 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1167 	rec->index_map_cnt += 1;
1168 	return 0;
1169 }
1170 
1171 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1172 						    struct evlist *evlist,
1173 						    struct record_thread *thread_data)
1174 {
1175 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1176 	struct pollfd *t_entries = thread_data->pollfd.entries;
1177 	int err = 0;
1178 	size_t i;
1179 
1180 	for (i = 0; i < rec->index_map_cnt; i++) {
1181 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1182 		int t_pos = rec->index_map[i].thread_pollfd_index;
1183 
1184 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1185 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1186 			pr_err("Thread and evlist pollfd index mismatch\n");
1187 			err = -EINVAL;
1188 			continue;
1189 		}
1190 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1191 	}
1192 	return err;
1193 }
1194 
1195 static int record__dup_non_perf_events(struct record *rec,
1196 				       struct evlist *evlist,
1197 				       struct record_thread *thread_data)
1198 {
1199 	struct fdarray *fda = &evlist->core.pollfd;
1200 	int i, ret;
1201 
1202 	for (i = 0; i < fda->nr; i++) {
1203 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1204 			continue;
1205 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1206 		if (ret < 0) {
1207 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1208 			return ret;
1209 		}
1210 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1211 			  thread_data, ret, fda->entries[i].fd);
1212 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1213 		if (ret < 0) {
1214 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1215 			return ret;
1216 		}
1217 	}
1218 	return 0;
1219 }
1220 
1221 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1222 {
1223 	int t, ret;
1224 	struct record_thread *thread_data;
1225 
1226 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1227 	if (!rec->thread_data) {
1228 		pr_err("Failed to allocate thread data\n");
1229 		return -ENOMEM;
1230 	}
1231 	thread_data = rec->thread_data;
1232 
1233 	for (t = 0; t < rec->nr_threads; t++)
1234 		record__thread_data_init_pipes(&thread_data[t]);
1235 
1236 	for (t = 0; t < rec->nr_threads; t++) {
1237 		thread_data[t].rec = rec;
1238 		thread_data[t].mask = &rec->thread_masks[t];
1239 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1240 		if (ret) {
1241 			pr_err("Failed to initialize thread[%d] maps\n", t);
1242 			goto out_free;
1243 		}
1244 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1245 		if (ret) {
1246 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1247 			goto out_free;
1248 		}
1249 		if (t) {
1250 			thread_data[t].tid = -1;
1251 			ret = record__thread_data_open_pipes(&thread_data[t]);
1252 			if (ret) {
1253 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1254 				goto out_free;
1255 			}
1256 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1257 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1258 			if (ret < 0) {
1259 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1260 				goto out_free;
1261 			}
1262 			thread_data[t].ctlfd_pos = ret;
1263 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1264 				 thread_data, thread_data[t].ctlfd_pos,
1265 				 thread_data[t].pipes.msg[0]);
1266 		} else {
1267 			thread_data[t].tid = gettid();
1268 
1269 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1270 			if (ret < 0)
1271 				goto out_free;
1272 
1273 			thread_data[t].ctlfd_pos = -1; /* Not used */
1274 		}
1275 	}
1276 
1277 	return 0;
1278 
1279 out_free:
1280 	record__free_thread_data(rec);
1281 
1282 	return ret;
1283 }
1284 
1285 static int record__mmap_evlist(struct record *rec,
1286 			       struct evlist *evlist)
1287 {
1288 	int i, ret;
1289 	struct record_opts *opts = &rec->opts;
1290 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1291 				  opts->auxtrace_sample_mode;
1292 	char msg[512];
1293 
1294 	if (opts->affinity != PERF_AFFINITY_SYS)
1295 		cpu__setup_cpunode_map();
1296 
1297 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1298 				 opts->auxtrace_mmap_pages,
1299 				 auxtrace_overwrite,
1300 				 opts->nr_cblocks, opts->affinity,
1301 				 opts->mmap_flush, opts->comp_level) < 0) {
1302 		if (errno == EPERM) {
1303 			pr_err("Permission error mapping pages.\n"
1304 			       "Consider increasing "
1305 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1306 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1307 			       "(current value: %u,%u)\n",
1308 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1309 			return -errno;
1310 		} else {
1311 			pr_err("failed to mmap with %d (%s)\n", errno,
1312 				str_error_r(errno, msg, sizeof(msg)));
1313 			if (errno)
1314 				return -errno;
1315 			else
1316 				return -EINVAL;
1317 		}
1318 	}
1319 
1320 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1321 		return -1;
1322 
1323 	ret = record__alloc_thread_data(rec, evlist);
1324 	if (ret)
1325 		return ret;
1326 
1327 	if (record__threads_enabled(rec)) {
1328 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1329 		if (ret) {
1330 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1331 			return ret;
1332 		}
1333 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1334 			if (evlist->mmap)
1335 				evlist->mmap[i].file = &rec->data.dir.files[i];
1336 			if (evlist->overwrite_mmap)
1337 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1338 		}
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int record__mmap(struct record *rec)
1345 {
1346 	return record__mmap_evlist(rec, rec->evlist);
1347 }
1348 
1349 static int record__open(struct record *rec)
1350 {
1351 	char msg[BUFSIZ];
1352 	struct evsel *pos;
1353 	struct evlist *evlist = rec->evlist;
1354 	struct perf_session *session = rec->session;
1355 	struct record_opts *opts = &rec->opts;
1356 	int rc = 0;
1357 
1358 	evlist__config(evlist, opts, &callchain_param);
1359 
1360 	evlist__for_each_entry(evlist, pos) {
1361 try_again:
1362 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1363 			if (evsel__fallback(pos, &opts->target, errno, msg, sizeof(msg))) {
1364 				if (verbose > 0)
1365 					ui__warning("%s\n", msg);
1366 				goto try_again;
1367 			}
1368 			if ((errno == EINVAL || errno == EBADF) &&
1369 			    pos->core.leader != &pos->core &&
1370 			    pos->weak_group) {
1371 			        pos = evlist__reset_weak_group(evlist, pos, true);
1372 				goto try_again;
1373 			}
1374 			rc = -errno;
1375 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1376 			ui__error("%s\n", msg);
1377 			goto out;
1378 		}
1379 
1380 		pos->supported = true;
1381 	}
1382 
1383 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1384 		pr_warning(
1385 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1386 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1387 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1388 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1389 "Samples in kernel modules won't be resolved at all.\n\n"
1390 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1391 "even with a suitable vmlinux or kallsyms file.\n\n");
1392 	}
1393 
1394 	if (evlist__apply_filters(evlist, &pos)) {
1395 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1396 			pos->filter ?: "BPF", evsel__name(pos), errno,
1397 			str_error_r(errno, msg, sizeof(msg)));
1398 		rc = -1;
1399 		goto out;
1400 	}
1401 
1402 	rc = record__mmap(rec);
1403 	if (rc)
1404 		goto out;
1405 
1406 	session->evlist = evlist;
1407 	perf_session__set_id_hdr_size(session);
1408 out:
1409 	return rc;
1410 }
1411 
1412 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1413 {
1414 	if (rec->evlist->first_sample_time == 0)
1415 		rec->evlist->first_sample_time = sample_time;
1416 
1417 	if (sample_time)
1418 		rec->evlist->last_sample_time = sample_time;
1419 }
1420 
1421 static int process_sample_event(struct perf_tool *tool,
1422 				union perf_event *event,
1423 				struct perf_sample *sample,
1424 				struct evsel *evsel,
1425 				struct machine *machine)
1426 {
1427 	struct record *rec = container_of(tool, struct record, tool);
1428 
1429 	set_timestamp_boundary(rec, sample->time);
1430 
1431 	if (rec->buildid_all)
1432 		return 0;
1433 
1434 	rec->samples++;
1435 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1436 }
1437 
1438 static int process_buildids(struct record *rec)
1439 {
1440 	struct perf_session *session = rec->session;
1441 
1442 	if (perf_data__size(&rec->data) == 0)
1443 		return 0;
1444 
1445 	/*
1446 	 * During this process, it'll load kernel map and replace the
1447 	 * dso->long_name to a real pathname it found.  In this case
1448 	 * we prefer the vmlinux path like
1449 	 *   /lib/modules/3.16.4/build/vmlinux
1450 	 *
1451 	 * rather than build-id path (in debug directory).
1452 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1453 	 */
1454 	symbol_conf.ignore_vmlinux_buildid = true;
1455 
1456 	/*
1457 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1458 	 * so no need to process samples. But if timestamp_boundary is enabled,
1459 	 * it still needs to walk on all samples to get the timestamps of
1460 	 * first/last samples.
1461 	 */
1462 	if (rec->buildid_all && !rec->timestamp_boundary)
1463 		rec->tool.sample = NULL;
1464 
1465 	return perf_session__process_events(session);
1466 }
1467 
1468 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1469 {
1470 	int err;
1471 	struct perf_tool *tool = data;
1472 	/*
1473 	 *As for guest kernel when processing subcommand record&report,
1474 	 *we arrange module mmap prior to guest kernel mmap and trigger
1475 	 *a preload dso because default guest module symbols are loaded
1476 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1477 	 *method is used to avoid symbol missing when the first addr is
1478 	 *in module instead of in guest kernel.
1479 	 */
1480 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1481 					     machine);
1482 	if (err < 0)
1483 		pr_err("Couldn't record guest kernel [%d]'s reference"
1484 		       " relocation symbol.\n", machine->pid);
1485 
1486 	/*
1487 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1488 	 * have no _text sometimes.
1489 	 */
1490 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1491 						 machine);
1492 	if (err < 0)
1493 		pr_err("Couldn't record guest kernel [%d]'s reference"
1494 		       " relocation symbol.\n", machine->pid);
1495 }
1496 
1497 static struct perf_event_header finished_round_event = {
1498 	.size = sizeof(struct perf_event_header),
1499 	.type = PERF_RECORD_FINISHED_ROUND,
1500 };
1501 
1502 static struct perf_event_header finished_init_event = {
1503 	.size = sizeof(struct perf_event_header),
1504 	.type = PERF_RECORD_FINISHED_INIT,
1505 };
1506 
1507 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1508 {
1509 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1510 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1511 			  thread->mask->affinity.nbits)) {
1512 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1513 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1514 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1515 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1516 					(cpu_set_t *)thread->mask->affinity.bits);
1517 		if (verbose == 2) {
1518 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1519 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1520 		}
1521 	}
1522 }
1523 
1524 static size_t process_comp_header(void *record, size_t increment)
1525 {
1526 	struct perf_record_compressed *event = record;
1527 	size_t size = sizeof(*event);
1528 
1529 	if (increment) {
1530 		event->header.size += increment;
1531 		return increment;
1532 	}
1533 
1534 	event->header.type = PERF_RECORD_COMPRESSED;
1535 	event->header.size = size;
1536 
1537 	return size;
1538 }
1539 
1540 static ssize_t zstd_compress(struct perf_session *session, struct mmap *map,
1541 			    void *dst, size_t dst_size, void *src, size_t src_size)
1542 {
1543 	ssize_t compressed;
1544 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1545 	struct zstd_data *zstd_data = &session->zstd_data;
1546 
1547 	if (map && map->file)
1548 		zstd_data = &map->zstd_data;
1549 
1550 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1551 						     max_record_size, process_comp_header);
1552 	if (compressed < 0)
1553 		return compressed;
1554 
1555 	if (map && map->file) {
1556 		thread->bytes_transferred += src_size;
1557 		thread->bytes_compressed  += compressed;
1558 	} else {
1559 		session->bytes_transferred += src_size;
1560 		session->bytes_compressed  += compressed;
1561 	}
1562 
1563 	return compressed;
1564 }
1565 
1566 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1567 				    bool overwrite, bool synch)
1568 {
1569 	u64 bytes_written = rec->bytes_written;
1570 	int i;
1571 	int rc = 0;
1572 	int nr_mmaps;
1573 	struct mmap **maps;
1574 	int trace_fd = rec->data.file.fd;
1575 	off_t off = 0;
1576 
1577 	if (!evlist)
1578 		return 0;
1579 
1580 	nr_mmaps = thread->nr_mmaps;
1581 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1582 
1583 	if (!maps)
1584 		return 0;
1585 
1586 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1587 		return 0;
1588 
1589 	if (record__aio_enabled(rec))
1590 		off = record__aio_get_pos(trace_fd);
1591 
1592 	for (i = 0; i < nr_mmaps; i++) {
1593 		u64 flush = 0;
1594 		struct mmap *map = maps[i];
1595 
1596 		if (map->core.base) {
1597 			record__adjust_affinity(rec, map);
1598 			if (synch) {
1599 				flush = map->core.flush;
1600 				map->core.flush = 1;
1601 			}
1602 			if (!record__aio_enabled(rec)) {
1603 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1604 					if (synch)
1605 						map->core.flush = flush;
1606 					rc = -1;
1607 					goto out;
1608 				}
1609 			} else {
1610 				if (record__aio_push(rec, map, &off) < 0) {
1611 					record__aio_set_pos(trace_fd, off);
1612 					if (synch)
1613 						map->core.flush = flush;
1614 					rc = -1;
1615 					goto out;
1616 				}
1617 			}
1618 			if (synch)
1619 				map->core.flush = flush;
1620 		}
1621 
1622 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1623 		    !rec->opts.auxtrace_sample_mode &&
1624 		    record__auxtrace_mmap_read(rec, map) != 0) {
1625 			rc = -1;
1626 			goto out;
1627 		}
1628 	}
1629 
1630 	if (record__aio_enabled(rec))
1631 		record__aio_set_pos(trace_fd, off);
1632 
1633 	/*
1634 	 * Mark the round finished in case we wrote
1635 	 * at least one event.
1636 	 *
1637 	 * No need for round events in directory mode,
1638 	 * because per-cpu maps and files have data
1639 	 * sorted by kernel.
1640 	 */
1641 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1642 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1643 
1644 	if (overwrite)
1645 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1646 out:
1647 	return rc;
1648 }
1649 
1650 static int record__mmap_read_all(struct record *rec, bool synch)
1651 {
1652 	int err;
1653 
1654 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1655 	if (err)
1656 		return err;
1657 
1658 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1659 }
1660 
1661 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1662 					   void *arg __maybe_unused)
1663 {
1664 	struct perf_mmap *map = fda->priv[fd].ptr;
1665 
1666 	if (map)
1667 		perf_mmap__put(map);
1668 }
1669 
1670 static void *record__thread(void *arg)
1671 {
1672 	enum thread_msg msg = THREAD_MSG__READY;
1673 	bool terminate = false;
1674 	struct fdarray *pollfd;
1675 	int err, ctlfd_pos;
1676 
1677 	thread = arg;
1678 	thread->tid = gettid();
1679 
1680 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1681 	if (err == -1)
1682 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1683 			   thread->tid, strerror(errno));
1684 
1685 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1686 
1687 	pollfd = &thread->pollfd;
1688 	ctlfd_pos = thread->ctlfd_pos;
1689 
1690 	for (;;) {
1691 		unsigned long long hits = thread->samples;
1692 
1693 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1694 			break;
1695 
1696 		if (hits == thread->samples) {
1697 
1698 			err = fdarray__poll(pollfd, -1);
1699 			/*
1700 			 * Propagate error, only if there's any. Ignore positive
1701 			 * number of returned events and interrupt error.
1702 			 */
1703 			if (err > 0 || (err < 0 && errno == EINTR))
1704 				err = 0;
1705 			thread->waking++;
1706 
1707 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1708 					    record__thread_munmap_filtered, NULL) == 0)
1709 				break;
1710 		}
1711 
1712 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1713 			terminate = true;
1714 			close(thread->pipes.msg[0]);
1715 			thread->pipes.msg[0] = -1;
1716 			pollfd->entries[ctlfd_pos].fd = -1;
1717 			pollfd->entries[ctlfd_pos].events = 0;
1718 		}
1719 
1720 		pollfd->entries[ctlfd_pos].revents = 0;
1721 	}
1722 	record__mmap_read_all(thread->rec, true);
1723 
1724 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1725 	if (err == -1)
1726 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1727 			   thread->tid, strerror(errno));
1728 
1729 	return NULL;
1730 }
1731 
1732 static void record__init_features(struct record *rec)
1733 {
1734 	struct perf_session *session = rec->session;
1735 	int feat;
1736 
1737 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1738 		perf_header__set_feat(&session->header, feat);
1739 
1740 	if (rec->no_buildid)
1741 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1742 
1743 #ifdef HAVE_LIBTRACEEVENT
1744 	if (!have_tracepoints(&rec->evlist->core.entries))
1745 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1746 #endif
1747 
1748 	if (!rec->opts.branch_stack)
1749 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1750 
1751 	if (!rec->opts.full_auxtrace)
1752 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1753 
1754 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1755 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1756 
1757 	if (!rec->opts.use_clockid)
1758 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1759 
1760 	if (!record__threads_enabled(rec))
1761 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1762 
1763 	if (!record__comp_enabled(rec))
1764 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1765 
1766 	perf_header__clear_feat(&session->header, HEADER_STAT);
1767 }
1768 
1769 static void
1770 record__finish_output(struct record *rec)
1771 {
1772 	int i;
1773 	struct perf_data *data = &rec->data;
1774 	int fd = perf_data__fd(data);
1775 
1776 	if (data->is_pipe)
1777 		return;
1778 
1779 	rec->session->header.data_size += rec->bytes_written;
1780 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1781 	if (record__threads_enabled(rec)) {
1782 		for (i = 0; i < data->dir.nr; i++)
1783 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1784 	}
1785 
1786 	if (!rec->no_buildid) {
1787 		process_buildids(rec);
1788 
1789 		if (rec->buildid_all)
1790 			dsos__hit_all(rec->session);
1791 	}
1792 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1793 
1794 	return;
1795 }
1796 
1797 static int record__synthesize_workload(struct record *rec, bool tail)
1798 {
1799 	int err;
1800 	struct perf_thread_map *thread_map;
1801 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1802 
1803 	if (rec->opts.tail_synthesize != tail)
1804 		return 0;
1805 
1806 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1807 	if (thread_map == NULL)
1808 		return -1;
1809 
1810 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1811 						 process_synthesized_event,
1812 						 &rec->session->machines.host,
1813 						 needs_mmap,
1814 						 rec->opts.sample_address);
1815 	perf_thread_map__put(thread_map);
1816 	return err;
1817 }
1818 
1819 static int write_finished_init(struct record *rec, bool tail)
1820 {
1821 	if (rec->opts.tail_synthesize != tail)
1822 		return 0;
1823 
1824 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1825 }
1826 
1827 static int record__synthesize(struct record *rec, bool tail);
1828 
1829 static int
1830 record__switch_output(struct record *rec, bool at_exit)
1831 {
1832 	struct perf_data *data = &rec->data;
1833 	int fd, err;
1834 	char *new_filename;
1835 
1836 	/* Same Size:      "2015122520103046"*/
1837 	char timestamp[] = "InvalidTimestamp";
1838 
1839 	record__aio_mmap_read_sync(rec);
1840 
1841 	write_finished_init(rec, true);
1842 
1843 	record__synthesize(rec, true);
1844 	if (target__none(&rec->opts.target))
1845 		record__synthesize_workload(rec, true);
1846 
1847 	rec->samples = 0;
1848 	record__finish_output(rec);
1849 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1850 	if (err) {
1851 		pr_err("Failed to get current timestamp\n");
1852 		return -EINVAL;
1853 	}
1854 
1855 	fd = perf_data__switch(data, timestamp,
1856 				    rec->session->header.data_offset,
1857 				    at_exit, &new_filename);
1858 	if (fd >= 0 && !at_exit) {
1859 		rec->bytes_written = 0;
1860 		rec->session->header.data_size = 0;
1861 	}
1862 
1863 	if (!quiet)
1864 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1865 			data->path, timestamp);
1866 
1867 	if (rec->switch_output.num_files) {
1868 		int n = rec->switch_output.cur_file + 1;
1869 
1870 		if (n >= rec->switch_output.num_files)
1871 			n = 0;
1872 		rec->switch_output.cur_file = n;
1873 		if (rec->switch_output.filenames[n]) {
1874 			remove(rec->switch_output.filenames[n]);
1875 			zfree(&rec->switch_output.filenames[n]);
1876 		}
1877 		rec->switch_output.filenames[n] = new_filename;
1878 	} else {
1879 		free(new_filename);
1880 	}
1881 
1882 	/* Output tracking events */
1883 	if (!at_exit) {
1884 		record__synthesize(rec, false);
1885 
1886 		/*
1887 		 * In 'perf record --switch-output' without -a,
1888 		 * record__synthesize() in record__switch_output() won't
1889 		 * generate tracking events because there's no thread_map
1890 		 * in evlist. Which causes newly created perf.data doesn't
1891 		 * contain map and comm information.
1892 		 * Create a fake thread_map and directly call
1893 		 * perf_event__synthesize_thread_map() for those events.
1894 		 */
1895 		if (target__none(&rec->opts.target))
1896 			record__synthesize_workload(rec, false);
1897 		write_finished_init(rec, false);
1898 	}
1899 	return fd;
1900 }
1901 
1902 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1903 					struct perf_record_lost_samples *lost,
1904 					int cpu_idx, int thread_idx, u64 lost_count,
1905 					u16 misc_flag)
1906 {
1907 	struct perf_sample_id *sid;
1908 	struct perf_sample sample = {};
1909 	int id_hdr_size;
1910 
1911 	lost->lost = lost_count;
1912 	if (evsel->core.ids) {
1913 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1914 		sample.id = sid->id;
1915 	}
1916 
1917 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1918 						       evsel->core.attr.sample_type, &sample);
1919 	lost->header.size = sizeof(*lost) + id_hdr_size;
1920 	lost->header.misc = misc_flag;
1921 	record__write(rec, NULL, lost, lost->header.size);
1922 }
1923 
1924 static void record__read_lost_samples(struct record *rec)
1925 {
1926 	struct perf_session *session = rec->session;
1927 	struct perf_record_lost_samples *lost = NULL;
1928 	struct evsel *evsel;
1929 
1930 	/* there was an error during record__open */
1931 	if (session->evlist == NULL)
1932 		return;
1933 
1934 	evlist__for_each_entry(session->evlist, evsel) {
1935 		struct xyarray *xy = evsel->core.sample_id;
1936 		u64 lost_count;
1937 
1938 		if (xy == NULL || evsel->core.fd == NULL)
1939 			continue;
1940 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1941 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1942 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1943 			continue;
1944 		}
1945 
1946 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1947 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1948 				struct perf_counts_values count;
1949 
1950 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1951 					pr_debug("read LOST count failed\n");
1952 					goto out;
1953 				}
1954 
1955 				if (count.lost) {
1956 					if (!lost) {
1957 						lost = zalloc(sizeof(*lost) +
1958 							      session->machines.host.id_hdr_size);
1959 						if (!lost) {
1960 							pr_debug("Memory allocation failed\n");
1961 							return;
1962 						}
1963 						lost->header.type = PERF_RECORD_LOST_SAMPLES;
1964 					}
1965 					__record__save_lost_samples(rec, evsel, lost,
1966 								    x, y, count.lost, 0);
1967 				}
1968 			}
1969 		}
1970 
1971 		lost_count = perf_bpf_filter__lost_count(evsel);
1972 		if (lost_count) {
1973 			if (!lost) {
1974 				lost = zalloc(sizeof(*lost) +
1975 					      session->machines.host.id_hdr_size);
1976 				if (!lost) {
1977 					pr_debug("Memory allocation failed\n");
1978 					return;
1979 				}
1980 				lost->header.type = PERF_RECORD_LOST_SAMPLES;
1981 			}
1982 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1983 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1984 		}
1985 	}
1986 out:
1987 	free(lost);
1988 }
1989 
1990 static volatile sig_atomic_t workload_exec_errno;
1991 
1992 /*
1993  * evlist__prepare_workload will send a SIGUSR1
1994  * if the fork fails, since we asked by setting its
1995  * want_signal to true.
1996  */
1997 static void workload_exec_failed_signal(int signo __maybe_unused,
1998 					siginfo_t *info,
1999 					void *ucontext __maybe_unused)
2000 {
2001 	workload_exec_errno = info->si_value.sival_int;
2002 	done = 1;
2003 	child_finished = 1;
2004 }
2005 
2006 static void snapshot_sig_handler(int sig);
2007 static void alarm_sig_handler(int sig);
2008 
2009 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
2010 {
2011 	if (evlist) {
2012 		if (evlist->mmap && evlist->mmap[0].core.base)
2013 			return evlist->mmap[0].core.base;
2014 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
2015 			return evlist->overwrite_mmap[0].core.base;
2016 	}
2017 	return NULL;
2018 }
2019 
2020 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
2021 {
2022 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2023 	if (pc)
2024 		return pc;
2025 	return NULL;
2026 }
2027 
2028 static int record__synthesize(struct record *rec, bool tail)
2029 {
2030 	struct perf_session *session = rec->session;
2031 	struct machine *machine = &session->machines.host;
2032 	struct perf_data *data = &rec->data;
2033 	struct record_opts *opts = &rec->opts;
2034 	struct perf_tool *tool = &rec->tool;
2035 	int err = 0;
2036 	event_op f = process_synthesized_event;
2037 
2038 	if (rec->opts.tail_synthesize != tail)
2039 		return 0;
2040 
2041 	if (data->is_pipe) {
2042 		err = perf_event__synthesize_for_pipe(tool, session, data,
2043 						      process_synthesized_event);
2044 		if (err < 0)
2045 			goto out;
2046 
2047 		rec->bytes_written += err;
2048 	}
2049 
2050 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2051 					  process_synthesized_event, machine);
2052 	if (err)
2053 		goto out;
2054 
2055 	/* Synthesize id_index before auxtrace_info */
2056 	err = perf_event__synthesize_id_index(tool,
2057 					      process_synthesized_event,
2058 					      session->evlist, machine);
2059 	if (err)
2060 		goto out;
2061 
2062 	if (rec->opts.full_auxtrace) {
2063 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2064 					session, process_synthesized_event);
2065 		if (err)
2066 			goto out;
2067 	}
2068 
2069 	if (!evlist__exclude_kernel(rec->evlist)) {
2070 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2071 							 machine);
2072 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2073 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2074 				   "Check /proc/kallsyms permission or run as root.\n");
2075 
2076 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2077 						     machine);
2078 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2079 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2080 				   "Check /proc/modules permission or run as root.\n");
2081 	}
2082 
2083 	if (perf_guest) {
2084 		machines__process_guests(&session->machines,
2085 					 perf_event__synthesize_guest_os, tool);
2086 	}
2087 
2088 	err = perf_event__synthesize_extra_attr(&rec->tool,
2089 						rec->evlist,
2090 						process_synthesized_event,
2091 						data->is_pipe);
2092 	if (err)
2093 		goto out;
2094 
2095 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2096 						 process_synthesized_event,
2097 						NULL);
2098 	if (err < 0) {
2099 		pr_err("Couldn't synthesize thread map.\n");
2100 		return err;
2101 	}
2102 
2103 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2104 					     process_synthesized_event, NULL);
2105 	if (err < 0) {
2106 		pr_err("Couldn't synthesize cpu map.\n");
2107 		return err;
2108 	}
2109 
2110 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2111 						machine, opts);
2112 	if (err < 0) {
2113 		pr_warning("Couldn't synthesize bpf events.\n");
2114 		err = 0;
2115 	}
2116 
2117 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2118 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2119 						     machine);
2120 		if (err < 0) {
2121 			pr_warning("Couldn't synthesize cgroup events.\n");
2122 			err = 0;
2123 		}
2124 	}
2125 
2126 	if (rec->opts.nr_threads_synthesize > 1) {
2127 		mutex_init(&synth_lock);
2128 		perf_set_multithreaded();
2129 		f = process_locked_synthesized_event;
2130 	}
2131 
2132 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2133 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2134 
2135 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2136 						    rec->evlist->core.threads,
2137 						    f, needs_mmap, opts->sample_address,
2138 						    rec->opts.nr_threads_synthesize);
2139 	}
2140 
2141 	if (rec->opts.nr_threads_synthesize > 1) {
2142 		perf_set_singlethreaded();
2143 		mutex_destroy(&synth_lock);
2144 	}
2145 
2146 out:
2147 	return err;
2148 }
2149 
2150 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2151 {
2152 	struct record *rec = data;
2153 	pthread_kill(rec->thread_id, SIGUSR2);
2154 	return 0;
2155 }
2156 
2157 static int record__setup_sb_evlist(struct record *rec)
2158 {
2159 	struct record_opts *opts = &rec->opts;
2160 
2161 	if (rec->sb_evlist != NULL) {
2162 		/*
2163 		 * We get here if --switch-output-event populated the
2164 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2165 		 * to the main thread.
2166 		 */
2167 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2168 		rec->thread_id = pthread_self();
2169 	}
2170 #ifdef HAVE_LIBBPF_SUPPORT
2171 	if (!opts->no_bpf_event) {
2172 		if (rec->sb_evlist == NULL) {
2173 			rec->sb_evlist = evlist__new();
2174 
2175 			if (rec->sb_evlist == NULL) {
2176 				pr_err("Couldn't create side band evlist.\n.");
2177 				return -1;
2178 			}
2179 		}
2180 
2181 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2182 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2183 			return -1;
2184 		}
2185 	}
2186 #endif
2187 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2188 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2189 		opts->no_bpf_event = true;
2190 	}
2191 
2192 	return 0;
2193 }
2194 
2195 static int record__init_clock(struct record *rec)
2196 {
2197 	struct perf_session *session = rec->session;
2198 	struct timespec ref_clockid;
2199 	struct timeval ref_tod;
2200 	u64 ref;
2201 
2202 	if (!rec->opts.use_clockid)
2203 		return 0;
2204 
2205 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2206 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2207 
2208 	session->header.env.clock.clockid = rec->opts.clockid;
2209 
2210 	if (gettimeofday(&ref_tod, NULL) != 0) {
2211 		pr_err("gettimeofday failed, cannot set reference time.\n");
2212 		return -1;
2213 	}
2214 
2215 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2216 		pr_err("clock_gettime failed, cannot set reference time.\n");
2217 		return -1;
2218 	}
2219 
2220 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2221 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2222 
2223 	session->header.env.clock.tod_ns = ref;
2224 
2225 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2226 	      (u64) ref_clockid.tv_nsec;
2227 
2228 	session->header.env.clock.clockid_ns = ref;
2229 	return 0;
2230 }
2231 
2232 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2233 {
2234 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2235 		trigger_hit(&auxtrace_snapshot_trigger);
2236 		auxtrace_record__snapshot_started = 1;
2237 		if (auxtrace_record__snapshot_start(rec->itr))
2238 			trigger_error(&auxtrace_snapshot_trigger);
2239 	}
2240 }
2241 
2242 static int record__terminate_thread(struct record_thread *thread_data)
2243 {
2244 	int err;
2245 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2246 	pid_t tid = thread_data->tid;
2247 
2248 	close(thread_data->pipes.msg[1]);
2249 	thread_data->pipes.msg[1] = -1;
2250 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2251 	if (err > 0)
2252 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2253 	else
2254 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2255 			   thread->tid, tid);
2256 
2257 	return 0;
2258 }
2259 
2260 static int record__start_threads(struct record *rec)
2261 {
2262 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2263 	struct record_thread *thread_data = rec->thread_data;
2264 	sigset_t full, mask;
2265 	pthread_t handle;
2266 	pthread_attr_t attrs;
2267 
2268 	thread = &thread_data[0];
2269 
2270 	if (!record__threads_enabled(rec))
2271 		return 0;
2272 
2273 	sigfillset(&full);
2274 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2275 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2276 		return -1;
2277 	}
2278 
2279 	pthread_attr_init(&attrs);
2280 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2281 
2282 	for (t = 1; t < nr_threads; t++) {
2283 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2284 
2285 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2286 		pthread_attr_setaffinity_np(&attrs,
2287 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2288 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2289 #endif
2290 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2291 			for (tt = 1; tt < t; tt++)
2292 				record__terminate_thread(&thread_data[t]);
2293 			pr_err("Failed to start threads: %s\n", strerror(errno));
2294 			ret = -1;
2295 			goto out_err;
2296 		}
2297 
2298 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2299 		if (err > 0)
2300 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2301 				  thread_msg_tags[msg]);
2302 		else
2303 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2304 				   thread->tid, rec->thread_data[t].tid);
2305 	}
2306 
2307 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2308 			(cpu_set_t *)thread->mask->affinity.bits);
2309 
2310 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2311 
2312 out_err:
2313 	pthread_attr_destroy(&attrs);
2314 
2315 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2316 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2317 		ret = -1;
2318 	}
2319 
2320 	return ret;
2321 }
2322 
2323 static int record__stop_threads(struct record *rec)
2324 {
2325 	int t;
2326 	struct record_thread *thread_data = rec->thread_data;
2327 
2328 	for (t = 1; t < rec->nr_threads; t++)
2329 		record__terminate_thread(&thread_data[t]);
2330 
2331 	for (t = 0; t < rec->nr_threads; t++) {
2332 		rec->samples += thread_data[t].samples;
2333 		if (!record__threads_enabled(rec))
2334 			continue;
2335 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2336 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2337 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2338 			 thread_data[t].samples, thread_data[t].waking);
2339 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2340 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2341 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2342 		else
2343 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2344 	}
2345 
2346 	return 0;
2347 }
2348 
2349 static unsigned long record__waking(struct record *rec)
2350 {
2351 	int t;
2352 	unsigned long waking = 0;
2353 	struct record_thread *thread_data = rec->thread_data;
2354 
2355 	for (t = 0; t < rec->nr_threads; t++)
2356 		waking += thread_data[t].waking;
2357 
2358 	return waking;
2359 }
2360 
2361 static int __cmd_record(struct record *rec, int argc, const char **argv)
2362 {
2363 	int err;
2364 	int status = 0;
2365 	const bool forks = argc > 0;
2366 	struct perf_tool *tool = &rec->tool;
2367 	struct record_opts *opts = &rec->opts;
2368 	struct perf_data *data = &rec->data;
2369 	struct perf_session *session;
2370 	bool disabled = false, draining = false;
2371 	int fd;
2372 	float ratio = 0;
2373 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2374 
2375 	atexit(record__sig_exit);
2376 	signal(SIGCHLD, sig_handler);
2377 	signal(SIGINT, sig_handler);
2378 	signal(SIGTERM, sig_handler);
2379 	signal(SIGSEGV, sigsegv_handler);
2380 
2381 	if (rec->opts.record_namespaces)
2382 		tool->namespace_events = true;
2383 
2384 	if (rec->opts.record_cgroup) {
2385 #ifdef HAVE_FILE_HANDLE
2386 		tool->cgroup_events = true;
2387 #else
2388 		pr_err("cgroup tracking is not supported\n");
2389 		return -1;
2390 #endif
2391 	}
2392 
2393 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2394 		signal(SIGUSR2, snapshot_sig_handler);
2395 		if (rec->opts.auxtrace_snapshot_mode)
2396 			trigger_on(&auxtrace_snapshot_trigger);
2397 		if (rec->switch_output.enabled)
2398 			trigger_on(&switch_output_trigger);
2399 	} else {
2400 		signal(SIGUSR2, SIG_IGN);
2401 	}
2402 
2403 	session = perf_session__new(data, tool);
2404 	if (IS_ERR(session)) {
2405 		pr_err("Perf session creation failed.\n");
2406 		return PTR_ERR(session);
2407 	}
2408 
2409 	if (record__threads_enabled(rec)) {
2410 		if (perf_data__is_pipe(&rec->data)) {
2411 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2412 			return -1;
2413 		}
2414 		if (rec->opts.full_auxtrace) {
2415 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2416 			return -1;
2417 		}
2418 	}
2419 
2420 	fd = perf_data__fd(data);
2421 	rec->session = session;
2422 
2423 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2424 		pr_err("Compression initialization failed.\n");
2425 		return -1;
2426 	}
2427 #ifdef HAVE_EVENTFD_SUPPORT
2428 	done_fd = eventfd(0, EFD_NONBLOCK);
2429 	if (done_fd < 0) {
2430 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2431 		status = -1;
2432 		goto out_delete_session;
2433 	}
2434 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2435 	if (err < 0) {
2436 		pr_err("Failed to add wakeup eventfd to poll list\n");
2437 		status = err;
2438 		goto out_delete_session;
2439 	}
2440 #endif // HAVE_EVENTFD_SUPPORT
2441 
2442 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2443 	session->header.env.comp_level = rec->opts.comp_level;
2444 
2445 	if (rec->opts.kcore &&
2446 	    !record__kcore_readable(&session->machines.host)) {
2447 		pr_err("ERROR: kcore is not readable.\n");
2448 		return -1;
2449 	}
2450 
2451 	if (record__init_clock(rec))
2452 		return -1;
2453 
2454 	record__init_features(rec);
2455 
2456 	if (forks) {
2457 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2458 					       workload_exec_failed_signal);
2459 		if (err < 0) {
2460 			pr_err("Couldn't run the workload!\n");
2461 			status = err;
2462 			goto out_delete_session;
2463 		}
2464 	}
2465 
2466 	/*
2467 	 * If we have just single event and are sending data
2468 	 * through pipe, we need to force the ids allocation,
2469 	 * because we synthesize event name through the pipe
2470 	 * and need the id for that.
2471 	 */
2472 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2473 		rec->opts.sample_id = true;
2474 
2475 	evlist__uniquify_name(rec->evlist);
2476 
2477 	/* Debug message used by test scripts */
2478 	pr_debug3("perf record opening and mmapping events\n");
2479 	if (record__open(rec) != 0) {
2480 		err = -1;
2481 		goto out_free_threads;
2482 	}
2483 	/* Debug message used by test scripts */
2484 	pr_debug3("perf record done opening and mmapping events\n");
2485 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2486 
2487 	if (rec->opts.kcore) {
2488 		err = record__kcore_copy(&session->machines.host, data);
2489 		if (err) {
2490 			pr_err("ERROR: Failed to copy kcore\n");
2491 			goto out_free_threads;
2492 		}
2493 	}
2494 
2495 	/*
2496 	 * Normally perf_session__new would do this, but it doesn't have the
2497 	 * evlist.
2498 	 */
2499 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2500 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2501 		rec->tool.ordered_events = false;
2502 	}
2503 
2504 	if (evlist__nr_groups(rec->evlist) == 0)
2505 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2506 
2507 	if (data->is_pipe) {
2508 		err = perf_header__write_pipe(fd);
2509 		if (err < 0)
2510 			goto out_free_threads;
2511 	} else {
2512 		err = perf_session__write_header(session, rec->evlist, fd, false);
2513 		if (err < 0)
2514 			goto out_free_threads;
2515 	}
2516 
2517 	err = -1;
2518 	if (!rec->no_buildid
2519 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2520 		pr_err("Couldn't generate buildids. "
2521 		       "Use --no-buildid to profile anyway.\n");
2522 		goto out_free_threads;
2523 	}
2524 
2525 	err = record__setup_sb_evlist(rec);
2526 	if (err)
2527 		goto out_free_threads;
2528 
2529 	err = record__synthesize(rec, false);
2530 	if (err < 0)
2531 		goto out_free_threads;
2532 
2533 	if (rec->realtime_prio) {
2534 		struct sched_param param;
2535 
2536 		param.sched_priority = rec->realtime_prio;
2537 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2538 			pr_err("Could not set realtime priority.\n");
2539 			err = -1;
2540 			goto out_free_threads;
2541 		}
2542 	}
2543 
2544 	if (record__start_threads(rec))
2545 		goto out_free_threads;
2546 
2547 	/*
2548 	 * When perf is starting the traced process, all the events
2549 	 * (apart from group members) have enable_on_exec=1 set,
2550 	 * so don't spoil it by prematurely enabling them.
2551 	 */
2552 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2553 		evlist__enable(rec->evlist);
2554 
2555 	/*
2556 	 * Let the child rip
2557 	 */
2558 	if (forks) {
2559 		struct machine *machine = &session->machines.host;
2560 		union perf_event *event;
2561 		pid_t tgid;
2562 
2563 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2564 		if (event == NULL) {
2565 			err = -ENOMEM;
2566 			goto out_child;
2567 		}
2568 
2569 		/*
2570 		 * Some H/W events are generated before COMM event
2571 		 * which is emitted during exec(), so perf script
2572 		 * cannot see a correct process name for those events.
2573 		 * Synthesize COMM event to prevent it.
2574 		 */
2575 		tgid = perf_event__synthesize_comm(tool, event,
2576 						   rec->evlist->workload.pid,
2577 						   process_synthesized_event,
2578 						   machine);
2579 		free(event);
2580 
2581 		if (tgid == -1)
2582 			goto out_child;
2583 
2584 		event = malloc(sizeof(event->namespaces) +
2585 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2586 			       machine->id_hdr_size);
2587 		if (event == NULL) {
2588 			err = -ENOMEM;
2589 			goto out_child;
2590 		}
2591 
2592 		/*
2593 		 * Synthesize NAMESPACES event for the command specified.
2594 		 */
2595 		perf_event__synthesize_namespaces(tool, event,
2596 						  rec->evlist->workload.pid,
2597 						  tgid, process_synthesized_event,
2598 						  machine);
2599 		free(event);
2600 
2601 		evlist__start_workload(rec->evlist);
2602 	}
2603 
2604 	if (opts->target.initial_delay) {
2605 		pr_info(EVLIST_DISABLED_MSG);
2606 		if (opts->target.initial_delay > 0) {
2607 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2608 			evlist__enable(rec->evlist);
2609 			pr_info(EVLIST_ENABLED_MSG);
2610 		}
2611 	}
2612 
2613 	err = event_enable_timer__start(rec->evlist->eet);
2614 	if (err)
2615 		goto out_child;
2616 
2617 	/* Debug message used by test scripts */
2618 	pr_debug3("perf record has started\n");
2619 	fflush(stderr);
2620 
2621 	trigger_ready(&auxtrace_snapshot_trigger);
2622 	trigger_ready(&switch_output_trigger);
2623 	perf_hooks__invoke_record_start();
2624 
2625 	/*
2626 	 * Must write FINISHED_INIT so it will be seen after all other
2627 	 * synthesized user events, but before any regular events.
2628 	 */
2629 	err = write_finished_init(rec, false);
2630 	if (err < 0)
2631 		goto out_child;
2632 
2633 	for (;;) {
2634 		unsigned long long hits = thread->samples;
2635 
2636 		/*
2637 		 * rec->evlist->bkw_mmap_state is possible to be
2638 		 * BKW_MMAP_EMPTY here: when done == true and
2639 		 * hits != rec->samples in previous round.
2640 		 *
2641 		 * evlist__toggle_bkw_mmap ensure we never
2642 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2643 		 */
2644 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2645 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2646 
2647 		if (record__mmap_read_all(rec, false) < 0) {
2648 			trigger_error(&auxtrace_snapshot_trigger);
2649 			trigger_error(&switch_output_trigger);
2650 			err = -1;
2651 			goto out_child;
2652 		}
2653 
2654 		if (auxtrace_record__snapshot_started) {
2655 			auxtrace_record__snapshot_started = 0;
2656 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2657 				record__read_auxtrace_snapshot(rec, false);
2658 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2659 				pr_err("AUX area tracing snapshot failed\n");
2660 				err = -1;
2661 				goto out_child;
2662 			}
2663 		}
2664 
2665 		if (trigger_is_hit(&switch_output_trigger)) {
2666 			/*
2667 			 * If switch_output_trigger is hit, the data in
2668 			 * overwritable ring buffer should have been collected,
2669 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2670 			 *
2671 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2672 			 * record__mmap_read_all() didn't collect data from
2673 			 * overwritable ring buffer. Read again.
2674 			 */
2675 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2676 				continue;
2677 			trigger_ready(&switch_output_trigger);
2678 
2679 			/*
2680 			 * Reenable events in overwrite ring buffer after
2681 			 * record__mmap_read_all(): we should have collected
2682 			 * data from it.
2683 			 */
2684 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2685 
2686 			if (!quiet)
2687 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2688 					record__waking(rec));
2689 			thread->waking = 0;
2690 			fd = record__switch_output(rec, false);
2691 			if (fd < 0) {
2692 				pr_err("Failed to switch to new file\n");
2693 				trigger_error(&switch_output_trigger);
2694 				err = fd;
2695 				goto out_child;
2696 			}
2697 
2698 			/* re-arm the alarm */
2699 			if (rec->switch_output.time)
2700 				alarm(rec->switch_output.time);
2701 		}
2702 
2703 		if (hits == thread->samples) {
2704 			if (done || draining)
2705 				break;
2706 			err = fdarray__poll(&thread->pollfd, -1);
2707 			/*
2708 			 * Propagate error, only if there's any. Ignore positive
2709 			 * number of returned events and interrupt error.
2710 			 */
2711 			if (err > 0 || (err < 0 && errno == EINTR))
2712 				err = 0;
2713 			thread->waking++;
2714 
2715 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2716 					    record__thread_munmap_filtered, NULL) == 0)
2717 				draining = true;
2718 
2719 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2720 			if (err)
2721 				goto out_child;
2722 		}
2723 
2724 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2725 			switch (cmd) {
2726 			case EVLIST_CTL_CMD_SNAPSHOT:
2727 				hit_auxtrace_snapshot_trigger(rec);
2728 				evlist__ctlfd_ack(rec->evlist);
2729 				break;
2730 			case EVLIST_CTL_CMD_STOP:
2731 				done = 1;
2732 				break;
2733 			case EVLIST_CTL_CMD_ACK:
2734 			case EVLIST_CTL_CMD_UNSUPPORTED:
2735 			case EVLIST_CTL_CMD_ENABLE:
2736 			case EVLIST_CTL_CMD_DISABLE:
2737 			case EVLIST_CTL_CMD_EVLIST:
2738 			case EVLIST_CTL_CMD_PING:
2739 			default:
2740 				break;
2741 			}
2742 		}
2743 
2744 		err = event_enable_timer__process(rec->evlist->eet);
2745 		if (err < 0)
2746 			goto out_child;
2747 		if (err) {
2748 			err = 0;
2749 			done = 1;
2750 		}
2751 
2752 		/*
2753 		 * When perf is starting the traced process, at the end events
2754 		 * die with the process and we wait for that. Thus no need to
2755 		 * disable events in this case.
2756 		 */
2757 		if (done && !disabled && !target__none(&opts->target)) {
2758 			trigger_off(&auxtrace_snapshot_trigger);
2759 			evlist__disable(rec->evlist);
2760 			disabled = true;
2761 		}
2762 	}
2763 
2764 	trigger_off(&auxtrace_snapshot_trigger);
2765 	trigger_off(&switch_output_trigger);
2766 
2767 	if (opts->auxtrace_snapshot_on_exit)
2768 		record__auxtrace_snapshot_exit(rec);
2769 
2770 	if (forks && workload_exec_errno) {
2771 		char msg[STRERR_BUFSIZE], strevsels[2048];
2772 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2773 
2774 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2775 
2776 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2777 			strevsels, argv[0], emsg);
2778 		err = -1;
2779 		goto out_child;
2780 	}
2781 
2782 	if (!quiet)
2783 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2784 			record__waking(rec));
2785 
2786 	write_finished_init(rec, true);
2787 
2788 	if (target__none(&rec->opts.target))
2789 		record__synthesize_workload(rec, true);
2790 
2791 out_child:
2792 	record__stop_threads(rec);
2793 	record__mmap_read_all(rec, true);
2794 out_free_threads:
2795 	record__free_thread_data(rec);
2796 	evlist__finalize_ctlfd(rec->evlist);
2797 	record__aio_mmap_read_sync(rec);
2798 
2799 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2800 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2801 		session->header.env.comp_ratio = ratio + 0.5;
2802 	}
2803 
2804 	if (forks) {
2805 		int exit_status;
2806 
2807 		if (!child_finished)
2808 			kill(rec->evlist->workload.pid, SIGTERM);
2809 
2810 		wait(&exit_status);
2811 
2812 		if (err < 0)
2813 			status = err;
2814 		else if (WIFEXITED(exit_status))
2815 			status = WEXITSTATUS(exit_status);
2816 		else if (WIFSIGNALED(exit_status))
2817 			signr = WTERMSIG(exit_status);
2818 	} else
2819 		status = err;
2820 
2821 	if (rec->off_cpu)
2822 		rec->bytes_written += off_cpu_write(rec->session);
2823 
2824 	record__read_lost_samples(rec);
2825 	record__synthesize(rec, true);
2826 	/* this will be recalculated during process_buildids() */
2827 	rec->samples = 0;
2828 
2829 	if (!err) {
2830 		if (!rec->timestamp_filename) {
2831 			record__finish_output(rec);
2832 		} else {
2833 			fd = record__switch_output(rec, true);
2834 			if (fd < 0) {
2835 				status = fd;
2836 				goto out_delete_session;
2837 			}
2838 		}
2839 	}
2840 
2841 	perf_hooks__invoke_record_end();
2842 
2843 	if (!err && !quiet) {
2844 		char samples[128];
2845 		const char *postfix = rec->timestamp_filename ?
2846 					".<timestamp>" : "";
2847 
2848 		if (rec->samples && !rec->opts.full_auxtrace)
2849 			scnprintf(samples, sizeof(samples),
2850 				  " (%" PRIu64 " samples)", rec->samples);
2851 		else
2852 			samples[0] = '\0';
2853 
2854 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2855 			perf_data__size(data) / 1024.0 / 1024.0,
2856 			data->path, postfix, samples);
2857 		if (ratio) {
2858 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2859 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2860 					ratio);
2861 		}
2862 		fprintf(stderr, " ]\n");
2863 	}
2864 
2865 out_delete_session:
2866 #ifdef HAVE_EVENTFD_SUPPORT
2867 	if (done_fd >= 0) {
2868 		fd = done_fd;
2869 		done_fd = -1;
2870 
2871 		close(fd);
2872 	}
2873 #endif
2874 	zstd_fini(&session->zstd_data);
2875 	perf_session__delete(session);
2876 
2877 	if (!opts->no_bpf_event)
2878 		evlist__stop_sb_thread(rec->sb_evlist);
2879 	return status;
2880 }
2881 
2882 static void callchain_debug(struct callchain_param *callchain)
2883 {
2884 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2885 
2886 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2887 
2888 	if (callchain->record_mode == CALLCHAIN_DWARF)
2889 		pr_debug("callchain: stack dump size %d\n",
2890 			 callchain->dump_size);
2891 }
2892 
2893 int record_opts__parse_callchain(struct record_opts *record,
2894 				 struct callchain_param *callchain,
2895 				 const char *arg, bool unset)
2896 {
2897 	int ret;
2898 	callchain->enabled = !unset;
2899 
2900 	/* --no-call-graph */
2901 	if (unset) {
2902 		callchain->record_mode = CALLCHAIN_NONE;
2903 		pr_debug("callchain: disabled\n");
2904 		return 0;
2905 	}
2906 
2907 	ret = parse_callchain_record_opt(arg, callchain);
2908 	if (!ret) {
2909 		/* Enable data address sampling for DWARF unwind. */
2910 		if (callchain->record_mode == CALLCHAIN_DWARF)
2911 			record->sample_address = true;
2912 		callchain_debug(callchain);
2913 	}
2914 
2915 	return ret;
2916 }
2917 
2918 int record_parse_callchain_opt(const struct option *opt,
2919 			       const char *arg,
2920 			       int unset)
2921 {
2922 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2923 }
2924 
2925 int record_callchain_opt(const struct option *opt,
2926 			 const char *arg __maybe_unused,
2927 			 int unset __maybe_unused)
2928 {
2929 	struct callchain_param *callchain = opt->value;
2930 
2931 	callchain->enabled = true;
2932 
2933 	if (callchain->record_mode == CALLCHAIN_NONE)
2934 		callchain->record_mode = CALLCHAIN_FP;
2935 
2936 	callchain_debug(callchain);
2937 	return 0;
2938 }
2939 
2940 static int perf_record_config(const char *var, const char *value, void *cb)
2941 {
2942 	struct record *rec = cb;
2943 
2944 	if (!strcmp(var, "record.build-id")) {
2945 		if (!strcmp(value, "cache"))
2946 			rec->no_buildid_cache = false;
2947 		else if (!strcmp(value, "no-cache"))
2948 			rec->no_buildid_cache = true;
2949 		else if (!strcmp(value, "skip"))
2950 			rec->no_buildid = true;
2951 		else if (!strcmp(value, "mmap"))
2952 			rec->buildid_mmap = true;
2953 		else
2954 			return -1;
2955 		return 0;
2956 	}
2957 	if (!strcmp(var, "record.call-graph")) {
2958 		var = "call-graph.record-mode";
2959 		return perf_default_config(var, value, cb);
2960 	}
2961 #ifdef HAVE_AIO_SUPPORT
2962 	if (!strcmp(var, "record.aio")) {
2963 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2964 		if (!rec->opts.nr_cblocks)
2965 			rec->opts.nr_cblocks = nr_cblocks_default;
2966 	}
2967 #endif
2968 	if (!strcmp(var, "record.debuginfod")) {
2969 		rec->debuginfod.urls = strdup(value);
2970 		if (!rec->debuginfod.urls)
2971 			return -ENOMEM;
2972 		rec->debuginfod.set = true;
2973 	}
2974 
2975 	return 0;
2976 }
2977 
2978 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2979 {
2980 	struct record *rec = (struct record *)opt->value;
2981 
2982 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2983 }
2984 
2985 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2986 {
2987 	struct record_opts *opts = (struct record_opts *)opt->value;
2988 
2989 	if (unset || !str)
2990 		return 0;
2991 
2992 	if (!strcasecmp(str, "node"))
2993 		opts->affinity = PERF_AFFINITY_NODE;
2994 	else if (!strcasecmp(str, "cpu"))
2995 		opts->affinity = PERF_AFFINITY_CPU;
2996 
2997 	return 0;
2998 }
2999 
3000 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3001 {
3002 	mask->nbits = nr_bits;
3003 	mask->bits = bitmap_zalloc(mask->nbits);
3004 	if (!mask->bits)
3005 		return -ENOMEM;
3006 
3007 	return 0;
3008 }
3009 
3010 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3011 {
3012 	bitmap_free(mask->bits);
3013 	mask->nbits = 0;
3014 }
3015 
3016 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3017 {
3018 	int ret;
3019 
3020 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3021 	if (ret) {
3022 		mask->affinity.bits = NULL;
3023 		return ret;
3024 	}
3025 
3026 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3027 	if (ret) {
3028 		record__mmap_cpu_mask_free(&mask->maps);
3029 		mask->maps.bits = NULL;
3030 	}
3031 
3032 	return ret;
3033 }
3034 
3035 static void record__thread_mask_free(struct thread_mask *mask)
3036 {
3037 	record__mmap_cpu_mask_free(&mask->maps);
3038 	record__mmap_cpu_mask_free(&mask->affinity);
3039 }
3040 
3041 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3042 {
3043 	int s;
3044 	struct record_opts *opts = opt->value;
3045 
3046 	if (unset || !str || !strlen(str)) {
3047 		opts->threads_spec = THREAD_SPEC__CPU;
3048 	} else {
3049 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3050 			if (s == THREAD_SPEC__USER) {
3051 				opts->threads_user_spec = strdup(str);
3052 				if (!opts->threads_user_spec)
3053 					return -ENOMEM;
3054 				opts->threads_spec = THREAD_SPEC__USER;
3055 				break;
3056 			}
3057 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3058 				opts->threads_spec = s;
3059 				break;
3060 			}
3061 		}
3062 	}
3063 
3064 	if (opts->threads_spec == THREAD_SPEC__USER)
3065 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3066 	else
3067 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3068 
3069 	return 0;
3070 }
3071 
3072 static int parse_output_max_size(const struct option *opt,
3073 				 const char *str, int unset)
3074 {
3075 	unsigned long *s = (unsigned long *)opt->value;
3076 	static struct parse_tag tags_size[] = {
3077 		{ .tag  = 'B', .mult = 1       },
3078 		{ .tag  = 'K', .mult = 1 << 10 },
3079 		{ .tag  = 'M', .mult = 1 << 20 },
3080 		{ .tag  = 'G', .mult = 1 << 30 },
3081 		{ .tag  = 0 },
3082 	};
3083 	unsigned long val;
3084 
3085 	if (unset) {
3086 		*s = 0;
3087 		return 0;
3088 	}
3089 
3090 	val = parse_tag_value(str, tags_size);
3091 	if (val != (unsigned long) -1) {
3092 		*s = val;
3093 		return 0;
3094 	}
3095 
3096 	return -1;
3097 }
3098 
3099 static int record__parse_mmap_pages(const struct option *opt,
3100 				    const char *str,
3101 				    int unset __maybe_unused)
3102 {
3103 	struct record_opts *opts = opt->value;
3104 	char *s, *p;
3105 	unsigned int mmap_pages;
3106 	int ret;
3107 
3108 	if (!str)
3109 		return -EINVAL;
3110 
3111 	s = strdup(str);
3112 	if (!s)
3113 		return -ENOMEM;
3114 
3115 	p = strchr(s, ',');
3116 	if (p)
3117 		*p = '\0';
3118 
3119 	if (*s) {
3120 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3121 		if (ret)
3122 			goto out_free;
3123 		opts->mmap_pages = mmap_pages;
3124 	}
3125 
3126 	if (!p) {
3127 		ret = 0;
3128 		goto out_free;
3129 	}
3130 
3131 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3132 	if (ret)
3133 		goto out_free;
3134 
3135 	opts->auxtrace_mmap_pages = mmap_pages;
3136 
3137 out_free:
3138 	free(s);
3139 	return ret;
3140 }
3141 
3142 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3143 {
3144 }
3145 
3146 static int parse_control_option(const struct option *opt,
3147 				const char *str,
3148 				int unset __maybe_unused)
3149 {
3150 	struct record_opts *opts = opt->value;
3151 
3152 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3153 }
3154 
3155 static void switch_output_size_warn(struct record *rec)
3156 {
3157 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3158 	struct switch_output *s = &rec->switch_output;
3159 
3160 	wakeup_size /= 2;
3161 
3162 	if (s->size < wakeup_size) {
3163 		char buf[100];
3164 
3165 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3166 		pr_warning("WARNING: switch-output data size lower than "
3167 			   "wakeup kernel buffer size (%s) "
3168 			   "expect bigger perf.data sizes\n", buf);
3169 	}
3170 }
3171 
3172 static int switch_output_setup(struct record *rec)
3173 {
3174 	struct switch_output *s = &rec->switch_output;
3175 	static struct parse_tag tags_size[] = {
3176 		{ .tag  = 'B', .mult = 1       },
3177 		{ .tag  = 'K', .mult = 1 << 10 },
3178 		{ .tag  = 'M', .mult = 1 << 20 },
3179 		{ .tag  = 'G', .mult = 1 << 30 },
3180 		{ .tag  = 0 },
3181 	};
3182 	static struct parse_tag tags_time[] = {
3183 		{ .tag  = 's', .mult = 1        },
3184 		{ .tag  = 'm', .mult = 60       },
3185 		{ .tag  = 'h', .mult = 60*60    },
3186 		{ .tag  = 'd', .mult = 60*60*24 },
3187 		{ .tag  = 0 },
3188 	};
3189 	unsigned long val;
3190 
3191 	/*
3192 	 * If we're using --switch-output-events, then we imply its
3193 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3194 	 *  thread to its parent.
3195 	 */
3196 	if (rec->switch_output_event_set) {
3197 		if (record__threads_enabled(rec)) {
3198 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3199 			return 0;
3200 		}
3201 		goto do_signal;
3202 	}
3203 
3204 	if (!s->set)
3205 		return 0;
3206 
3207 	if (record__threads_enabled(rec)) {
3208 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3209 		return 0;
3210 	}
3211 
3212 	if (!strcmp(s->str, "signal")) {
3213 do_signal:
3214 		s->signal = true;
3215 		pr_debug("switch-output with SIGUSR2 signal\n");
3216 		goto enabled;
3217 	}
3218 
3219 	val = parse_tag_value(s->str, tags_size);
3220 	if (val != (unsigned long) -1) {
3221 		s->size = val;
3222 		pr_debug("switch-output with %s size threshold\n", s->str);
3223 		goto enabled;
3224 	}
3225 
3226 	val = parse_tag_value(s->str, tags_time);
3227 	if (val != (unsigned long) -1) {
3228 		s->time = val;
3229 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3230 			 s->str, s->time);
3231 		goto enabled;
3232 	}
3233 
3234 	return -1;
3235 
3236 enabled:
3237 	rec->timestamp_filename = true;
3238 	s->enabled              = true;
3239 
3240 	if (s->size && !rec->opts.no_buffering)
3241 		switch_output_size_warn(rec);
3242 
3243 	return 0;
3244 }
3245 
3246 static const char * const __record_usage[] = {
3247 	"perf record [<options>] [<command>]",
3248 	"perf record [<options>] -- <command> [<options>]",
3249 	NULL
3250 };
3251 const char * const *record_usage = __record_usage;
3252 
3253 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3254 				  struct perf_sample *sample, struct machine *machine)
3255 {
3256 	/*
3257 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3258 	 * no need to add them twice.
3259 	 */
3260 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3261 		return 0;
3262 	return perf_event__process_mmap(tool, event, sample, machine);
3263 }
3264 
3265 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3266 				   struct perf_sample *sample, struct machine *machine)
3267 {
3268 	/*
3269 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3270 	 * no need to add them twice.
3271 	 */
3272 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3273 		return 0;
3274 
3275 	return perf_event__process_mmap2(tool, event, sample, machine);
3276 }
3277 
3278 static int process_timestamp_boundary(struct perf_tool *tool,
3279 				      union perf_event *event __maybe_unused,
3280 				      struct perf_sample *sample,
3281 				      struct machine *machine __maybe_unused)
3282 {
3283 	struct record *rec = container_of(tool, struct record, tool);
3284 
3285 	set_timestamp_boundary(rec, sample->time);
3286 	return 0;
3287 }
3288 
3289 static int parse_record_synth_option(const struct option *opt,
3290 				     const char *str,
3291 				     int unset __maybe_unused)
3292 {
3293 	struct record_opts *opts = opt->value;
3294 	char *p = strdup(str);
3295 
3296 	if (p == NULL)
3297 		return -1;
3298 
3299 	opts->synth = parse_synth_opt(p);
3300 	free(p);
3301 
3302 	if (opts->synth < 0) {
3303 		pr_err("Invalid synth option: %s\n", str);
3304 		return -1;
3305 	}
3306 	return 0;
3307 }
3308 
3309 /*
3310  * XXX Ideally would be local to cmd_record() and passed to a record__new
3311  * because we need to have access to it in record__exit, that is called
3312  * after cmd_record() exits, but since record_options need to be accessible to
3313  * builtin-script, leave it here.
3314  *
3315  * At least we don't ouch it in all the other functions here directly.
3316  *
3317  * Just say no to tons of global variables, sigh.
3318  */
3319 static struct record record = {
3320 	.opts = {
3321 		.sample_time	     = true,
3322 		.mmap_pages	     = UINT_MAX,
3323 		.user_freq	     = UINT_MAX,
3324 		.user_interval	     = ULLONG_MAX,
3325 		.freq		     = 4000,
3326 		.target		     = {
3327 			.uses_mmap   = true,
3328 			.default_per_cpu = true,
3329 		},
3330 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3331 		.nr_threads_synthesize = 1,
3332 		.ctl_fd              = -1,
3333 		.ctl_fd_ack          = -1,
3334 		.synth               = PERF_SYNTH_ALL,
3335 	},
3336 	.tool = {
3337 		.sample		= process_sample_event,
3338 		.fork		= perf_event__process_fork,
3339 		.exit		= perf_event__process_exit,
3340 		.comm		= perf_event__process_comm,
3341 		.namespaces	= perf_event__process_namespaces,
3342 		.mmap		= build_id__process_mmap,
3343 		.mmap2		= build_id__process_mmap2,
3344 		.itrace_start	= process_timestamp_boundary,
3345 		.aux		= process_timestamp_boundary,
3346 		.ordered_events	= true,
3347 	},
3348 };
3349 
3350 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3351 	"\n\t\t\t\tDefault: fp";
3352 
3353 static bool dry_run;
3354 
3355 static struct parse_events_option_args parse_events_option_args = {
3356 	.evlistp = &record.evlist,
3357 };
3358 
3359 static struct parse_events_option_args switch_output_parse_events_option_args = {
3360 	.evlistp = &record.sb_evlist,
3361 };
3362 
3363 /*
3364  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3365  * with it and switch to use the library functions in perf_evlist that came
3366  * from builtin-record.c, i.e. use record_opts,
3367  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3368  * using pipes, etc.
3369  */
3370 static struct option __record_options[] = {
3371 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3372 		     "event selector. use 'perf list' to list available events",
3373 		     parse_events_option),
3374 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3375 		     "event filter", parse_filter),
3376 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3377 			   NULL, "don't record events from perf itself",
3378 			   exclude_perf),
3379 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3380 		    "record events on existing process id"),
3381 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3382 		    "record events on existing thread id"),
3383 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3384 		    "collect data with this RT SCHED_FIFO priority"),
3385 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3386 		    "collect data without buffering"),
3387 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3388 		    "collect raw sample records from all opened counters"),
3389 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3390 			    "system-wide collection from all CPUs"),
3391 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3392 		    "list of cpus to monitor"),
3393 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3394 	OPT_STRING('o', "output", &record.data.path, "file",
3395 		    "output file name"),
3396 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3397 			&record.opts.no_inherit_set,
3398 			"child tasks do not inherit counters"),
3399 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3400 		    "synthesize non-sample events at the end of output"),
3401 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3402 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3403 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3404 		    "Fail if the specified frequency can't be used"),
3405 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3406 		     "profile at this frequency",
3407 		      record__parse_freq),
3408 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3409 		     "number of mmap data pages and AUX area tracing mmap pages",
3410 		     record__parse_mmap_pages),
3411 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3412 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3413 		     record__mmap_flush_parse),
3414 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3415 			   NULL, "enables call-graph recording" ,
3416 			   &record_callchain_opt),
3417 	OPT_CALLBACK(0, "call-graph", &record.opts,
3418 		     "record_mode[,record_size]", record_callchain_help,
3419 		     &record_parse_callchain_opt),
3420 	OPT_INCR('v', "verbose", &verbose,
3421 		    "be more verbose (show counter open errors, etc)"),
3422 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3423 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3424 		    "per thread counts"),
3425 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3426 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3427 		    "Record the sample physical addresses"),
3428 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3429 		    "Record the sampled data address data page size"),
3430 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3431 		    "Record the sampled code address (ip) page size"),
3432 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3433 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3434 		    "Record the sample identifier"),
3435 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3436 			&record.opts.sample_time_set,
3437 			"Record the sample timestamps"),
3438 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3439 			"Record the sample period"),
3440 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3441 		    "don't sample"),
3442 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3443 			&record.no_buildid_cache_set,
3444 			"do not update the buildid cache"),
3445 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3446 			&record.no_buildid_set,
3447 			"do not collect buildids in perf.data"),
3448 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3449 		     "monitor event in cgroup name only",
3450 		     parse_cgroups),
3451 	OPT_CALLBACK('D', "delay", &record, "ms",
3452 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3453 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3454 		     record__parse_event_enable_time),
3455 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3456 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3457 		   "user to profile"),
3458 
3459 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3460 		     "branch any", "sample any taken branches",
3461 		     parse_branch_stack),
3462 
3463 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3464 		     "branch filter mask", "branch stack filter modes",
3465 		     parse_branch_stack),
3466 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3467 		    "sample by weight (on special events only)"),
3468 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3469 		    "sample transaction flags (special events only)"),
3470 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3471 		    "use per-thread mmaps"),
3472 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3473 		    "sample selected machine registers on interrupt,"
3474 		    " use '-I?' to list register names", parse_intr_regs),
3475 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3476 		    "sample selected machine registers on interrupt,"
3477 		    " use '--user-regs=?' to list register names", parse_user_regs),
3478 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3479 		    "Record running/enabled time of read (:S) events"),
3480 	OPT_CALLBACK('k', "clockid", &record.opts,
3481 	"clockid", "clockid to use for events, see clock_gettime()",
3482 	parse_clockid),
3483 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3484 			  "opts", "AUX area tracing Snapshot Mode", ""),
3485 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3486 			  "opts", "sample AUX area", ""),
3487 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3488 			"per thread proc mmap processing timeout in ms"),
3489 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3490 		    "Record namespaces events"),
3491 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3492 		    "Record cgroup events"),
3493 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3494 			&record.opts.record_switch_events_set,
3495 			"Record context switch events"),
3496 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3497 			 "Configure all used events to run in kernel space.",
3498 			 PARSE_OPT_EXCLUSIVE),
3499 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3500 			 "Configure all used events to run in user space.",
3501 			 PARSE_OPT_EXCLUSIVE),
3502 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3503 		    "collect kernel callchains"),
3504 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3505 		    "collect user callchains"),
3506 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3507 		   "file", "vmlinux pathname"),
3508 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3509 		    "Record build-id of all DSOs regardless of hits"),
3510 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3511 		    "Record build-id in map events"),
3512 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3513 		    "append timestamp to output filename"),
3514 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3515 		    "Record timestamp boundary (time of first/last samples)"),
3516 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3517 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3518 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3519 			  "signal"),
3520 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3521 			 &record.switch_output_event_set, "switch output event",
3522 			 "switch output event selector. use 'perf list' to list available events",
3523 			 parse_events_option_new_evlist),
3524 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3525 		   "Limit number of switch output generated files"),
3526 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3527 		    "Parse options then exit"),
3528 #ifdef HAVE_AIO_SUPPORT
3529 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3530 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3531 		     record__aio_parse),
3532 #endif
3533 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3534 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3535 		     record__parse_affinity),
3536 #ifdef HAVE_ZSTD_SUPPORT
3537 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3538 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3539 			    record__parse_comp_level),
3540 #endif
3541 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3542 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3543 	OPT_UINTEGER(0, "num-thread-synthesize",
3544 		     &record.opts.nr_threads_synthesize,
3545 		     "number of threads to run for event synthesis"),
3546 #ifdef HAVE_LIBPFM
3547 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3548 		"libpfm4 event selector. use 'perf list' to list available events",
3549 		parse_libpfm_events_option),
3550 #endif
3551 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3552 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3553 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3554 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3555 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3556 		      parse_control_option),
3557 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3558 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3559 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3560 			  &record.debuginfod.set, "debuginfod urls",
3561 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3562 			  "system"),
3563 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3564 			    "write collected trace data into several data files using parallel threads",
3565 			    record__parse_threads),
3566 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3567 	OPT_END()
3568 };
3569 
3570 struct option *record_options = __record_options;
3571 
3572 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3573 {
3574 	struct perf_cpu cpu;
3575 	int idx;
3576 
3577 	if (cpu_map__is_dummy(cpus))
3578 		return 0;
3579 
3580 	perf_cpu_map__for_each_cpu_skip_any(cpu, idx, cpus) {
3581 		/* Return ENODEV is input cpu is greater than max cpu */
3582 		if ((unsigned long)cpu.cpu > mask->nbits)
3583 			return -ENODEV;
3584 		__set_bit(cpu.cpu, mask->bits);
3585 	}
3586 
3587 	return 0;
3588 }
3589 
3590 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3591 {
3592 	struct perf_cpu_map *cpus;
3593 
3594 	cpus = perf_cpu_map__new(mask_spec);
3595 	if (!cpus)
3596 		return -ENOMEM;
3597 
3598 	bitmap_zero(mask->bits, mask->nbits);
3599 	if (record__mmap_cpu_mask_init(mask, cpus))
3600 		return -ENODEV;
3601 
3602 	perf_cpu_map__put(cpus);
3603 
3604 	return 0;
3605 }
3606 
3607 static void record__free_thread_masks(struct record *rec, int nr_threads)
3608 {
3609 	int t;
3610 
3611 	if (rec->thread_masks)
3612 		for (t = 0; t < nr_threads; t++)
3613 			record__thread_mask_free(&rec->thread_masks[t]);
3614 
3615 	zfree(&rec->thread_masks);
3616 }
3617 
3618 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3619 {
3620 	int t, ret;
3621 
3622 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3623 	if (!rec->thread_masks) {
3624 		pr_err("Failed to allocate thread masks\n");
3625 		return -ENOMEM;
3626 	}
3627 
3628 	for (t = 0; t < nr_threads; t++) {
3629 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3630 		if (ret) {
3631 			pr_err("Failed to allocate thread masks[%d]\n", t);
3632 			goto out_free;
3633 		}
3634 	}
3635 
3636 	return 0;
3637 
3638 out_free:
3639 	record__free_thread_masks(rec, nr_threads);
3640 
3641 	return ret;
3642 }
3643 
3644 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3645 {
3646 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3647 
3648 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3649 	if (ret)
3650 		return ret;
3651 
3652 	rec->nr_threads = nr_cpus;
3653 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3654 
3655 	for (t = 0; t < rec->nr_threads; t++) {
3656 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3657 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3658 		if (verbose > 0) {
3659 			pr_debug("thread_masks[%d]: ", t);
3660 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3661 			pr_debug("thread_masks[%d]: ", t);
3662 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3663 		}
3664 	}
3665 
3666 	return 0;
3667 }
3668 
3669 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3670 					  const char **maps_spec, const char **affinity_spec,
3671 					  u32 nr_spec)
3672 {
3673 	u32 s;
3674 	int ret = 0, t = 0;
3675 	struct mmap_cpu_mask cpus_mask;
3676 	struct thread_mask thread_mask, full_mask, *thread_masks;
3677 
3678 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3679 	if (ret) {
3680 		pr_err("Failed to allocate CPUs mask\n");
3681 		return ret;
3682 	}
3683 
3684 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3685 	if (ret) {
3686 		pr_err("Failed to init cpu mask\n");
3687 		goto out_free_cpu_mask;
3688 	}
3689 
3690 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3691 	if (ret) {
3692 		pr_err("Failed to allocate full mask\n");
3693 		goto out_free_cpu_mask;
3694 	}
3695 
3696 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3697 	if (ret) {
3698 		pr_err("Failed to allocate thread mask\n");
3699 		goto out_free_full_and_cpu_masks;
3700 	}
3701 
3702 	for (s = 0; s < nr_spec; s++) {
3703 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3704 		if (ret) {
3705 			pr_err("Failed to initialize maps thread mask\n");
3706 			goto out_free;
3707 		}
3708 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3709 		if (ret) {
3710 			pr_err("Failed to initialize affinity thread mask\n");
3711 			goto out_free;
3712 		}
3713 
3714 		/* ignore invalid CPUs but do not allow empty masks */
3715 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3716 				cpus_mask.bits, thread_mask.maps.nbits)) {
3717 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3718 			ret = -EINVAL;
3719 			goto out_free;
3720 		}
3721 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3722 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3723 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3724 			ret = -EINVAL;
3725 			goto out_free;
3726 		}
3727 
3728 		/* do not allow intersection with other masks (full_mask) */
3729 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3730 				      thread_mask.maps.nbits)) {
3731 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3732 			ret = -EINVAL;
3733 			goto out_free;
3734 		}
3735 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3736 				      thread_mask.affinity.nbits)) {
3737 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3738 			ret = -EINVAL;
3739 			goto out_free;
3740 		}
3741 
3742 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3743 			  thread_mask.maps.bits, full_mask.maps.nbits);
3744 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3745 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3746 
3747 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3748 		if (!thread_masks) {
3749 			pr_err("Failed to reallocate thread masks\n");
3750 			ret = -ENOMEM;
3751 			goto out_free;
3752 		}
3753 		rec->thread_masks = thread_masks;
3754 		rec->thread_masks[t] = thread_mask;
3755 		if (verbose > 0) {
3756 			pr_debug("thread_masks[%d]: ", t);
3757 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3758 			pr_debug("thread_masks[%d]: ", t);
3759 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3760 		}
3761 		t++;
3762 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3763 		if (ret) {
3764 			pr_err("Failed to allocate thread mask\n");
3765 			goto out_free_full_and_cpu_masks;
3766 		}
3767 	}
3768 	rec->nr_threads = t;
3769 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3770 	if (!rec->nr_threads)
3771 		ret = -EINVAL;
3772 
3773 out_free:
3774 	record__thread_mask_free(&thread_mask);
3775 out_free_full_and_cpu_masks:
3776 	record__thread_mask_free(&full_mask);
3777 out_free_cpu_mask:
3778 	record__mmap_cpu_mask_free(&cpus_mask);
3779 
3780 	return ret;
3781 }
3782 
3783 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3784 {
3785 	int ret;
3786 	struct cpu_topology *topo;
3787 
3788 	topo = cpu_topology__new();
3789 	if (!topo) {
3790 		pr_err("Failed to allocate CPU topology\n");
3791 		return -ENOMEM;
3792 	}
3793 
3794 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3795 					     topo->core_cpus_list, topo->core_cpus_lists);
3796 	cpu_topology__delete(topo);
3797 
3798 	return ret;
3799 }
3800 
3801 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3802 {
3803 	int ret;
3804 	struct cpu_topology *topo;
3805 
3806 	topo = cpu_topology__new();
3807 	if (!topo) {
3808 		pr_err("Failed to allocate CPU topology\n");
3809 		return -ENOMEM;
3810 	}
3811 
3812 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3813 					     topo->package_cpus_list, topo->package_cpus_lists);
3814 	cpu_topology__delete(topo);
3815 
3816 	return ret;
3817 }
3818 
3819 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3820 {
3821 	u32 s;
3822 	int ret;
3823 	const char **spec;
3824 	struct numa_topology *topo;
3825 
3826 	topo = numa_topology__new();
3827 	if (!topo) {
3828 		pr_err("Failed to allocate NUMA topology\n");
3829 		return -ENOMEM;
3830 	}
3831 
3832 	spec = zalloc(topo->nr * sizeof(char *));
3833 	if (!spec) {
3834 		pr_err("Failed to allocate NUMA spec\n");
3835 		ret = -ENOMEM;
3836 		goto out_delete_topo;
3837 	}
3838 	for (s = 0; s < topo->nr; s++)
3839 		spec[s] = topo->nodes[s].cpus;
3840 
3841 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3842 
3843 	zfree(&spec);
3844 
3845 out_delete_topo:
3846 	numa_topology__delete(topo);
3847 
3848 	return ret;
3849 }
3850 
3851 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3852 {
3853 	int t, ret;
3854 	u32 s, nr_spec = 0;
3855 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3856 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3857 
3858 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3859 		spec = strtok_r(user_spec, ":", &spec_ptr);
3860 		if (spec == NULL)
3861 			break;
3862 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3863 		mask = strtok_r(spec, "/", &mask_ptr);
3864 		if (mask == NULL)
3865 			break;
3866 		pr_debug2("  maps mask: %s\n", mask);
3867 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3868 		if (!tmp_spec) {
3869 			pr_err("Failed to reallocate maps spec\n");
3870 			ret = -ENOMEM;
3871 			goto out_free;
3872 		}
3873 		maps_spec = tmp_spec;
3874 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3875 		if (!maps_spec[nr_spec]) {
3876 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3877 			ret = -ENOMEM;
3878 			goto out_free;
3879 		}
3880 		mask = strtok_r(NULL, "/", &mask_ptr);
3881 		if (mask == NULL) {
3882 			pr_err("Invalid thread maps or affinity specs\n");
3883 			ret = -EINVAL;
3884 			goto out_free;
3885 		}
3886 		pr_debug2("  affinity mask: %s\n", mask);
3887 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3888 		if (!tmp_spec) {
3889 			pr_err("Failed to reallocate affinity spec\n");
3890 			ret = -ENOMEM;
3891 			goto out_free;
3892 		}
3893 		affinity_spec = tmp_spec;
3894 		affinity_spec[nr_spec] = strdup(mask);
3895 		if (!affinity_spec[nr_spec]) {
3896 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3897 			ret = -ENOMEM;
3898 			goto out_free;
3899 		}
3900 		dup_mask = NULL;
3901 		nr_spec++;
3902 	}
3903 
3904 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3905 					     (const char **)affinity_spec, nr_spec);
3906 
3907 out_free:
3908 	free(dup_mask);
3909 	for (s = 0; s < nr_spec; s++) {
3910 		if (maps_spec)
3911 			free(maps_spec[s]);
3912 		if (affinity_spec)
3913 			free(affinity_spec[s]);
3914 	}
3915 	free(affinity_spec);
3916 	free(maps_spec);
3917 
3918 	return ret;
3919 }
3920 
3921 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3922 {
3923 	int ret;
3924 
3925 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3926 	if (ret)
3927 		return ret;
3928 
3929 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3930 		return -ENODEV;
3931 
3932 	rec->nr_threads = 1;
3933 
3934 	return 0;
3935 }
3936 
3937 static int record__init_thread_masks(struct record *rec)
3938 {
3939 	int ret = 0;
3940 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3941 
3942 	if (!record__threads_enabled(rec))
3943 		return record__init_thread_default_masks(rec, cpus);
3944 
3945 	if (evlist__per_thread(rec->evlist)) {
3946 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3947 		return -EINVAL;
3948 	}
3949 
3950 	switch (rec->opts.threads_spec) {
3951 	case THREAD_SPEC__CPU:
3952 		ret = record__init_thread_cpu_masks(rec, cpus);
3953 		break;
3954 	case THREAD_SPEC__CORE:
3955 		ret = record__init_thread_core_masks(rec, cpus);
3956 		break;
3957 	case THREAD_SPEC__PACKAGE:
3958 		ret = record__init_thread_package_masks(rec, cpus);
3959 		break;
3960 	case THREAD_SPEC__NUMA:
3961 		ret = record__init_thread_numa_masks(rec, cpus);
3962 		break;
3963 	case THREAD_SPEC__USER:
3964 		ret = record__init_thread_user_masks(rec, cpus);
3965 		break;
3966 	default:
3967 		break;
3968 	}
3969 
3970 	return ret;
3971 }
3972 
3973 int cmd_record(int argc, const char **argv)
3974 {
3975 	int err;
3976 	struct record *rec = &record;
3977 	char errbuf[BUFSIZ];
3978 
3979 	setlocale(LC_ALL, "");
3980 
3981 #ifndef HAVE_BPF_SKEL
3982 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3983 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3984 # undef set_nobuild
3985 #endif
3986 
3987 	/* Disable eager loading of kernel symbols that adds overhead to perf record. */
3988 	symbol_conf.lazy_load_kernel_maps = true;
3989 	rec->opts.affinity = PERF_AFFINITY_SYS;
3990 
3991 	rec->evlist = evlist__new();
3992 	if (rec->evlist == NULL)
3993 		return -ENOMEM;
3994 
3995 	err = perf_config(perf_record_config, rec);
3996 	if (err)
3997 		return err;
3998 
3999 	argc = parse_options(argc, argv, record_options, record_usage,
4000 			    PARSE_OPT_STOP_AT_NON_OPTION);
4001 	if (quiet)
4002 		perf_quiet_option();
4003 
4004 	err = symbol__validate_sym_arguments();
4005 	if (err)
4006 		return err;
4007 
4008 	perf_debuginfod_setup(&record.debuginfod);
4009 
4010 	/* Make system wide (-a) the default target. */
4011 	if (!argc && target__none(&rec->opts.target))
4012 		rec->opts.target.system_wide = true;
4013 
4014 	if (nr_cgroups && !rec->opts.target.system_wide) {
4015 		usage_with_options_msg(record_usage, record_options,
4016 			"cgroup monitoring only available in system-wide mode");
4017 
4018 	}
4019 
4020 	if (rec->buildid_mmap) {
4021 		if (!perf_can_record_build_id()) {
4022 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4023 			err = -EINVAL;
4024 			goto out_opts;
4025 		}
4026 		pr_debug("Enabling build id in mmap2 events.\n");
4027 		/* Enable mmap build id synthesizing. */
4028 		symbol_conf.buildid_mmap2 = true;
4029 		/* Enable perf_event_attr::build_id bit. */
4030 		rec->opts.build_id = true;
4031 		/* Disable build id cache. */
4032 		rec->no_buildid = true;
4033 	}
4034 
4035 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4036 		pr_err("Kernel has no cgroup sampling support.\n");
4037 		err = -EINVAL;
4038 		goto out_opts;
4039 	}
4040 
4041 	if (rec->opts.kcore)
4042 		rec->opts.text_poke = true;
4043 
4044 	if (rec->opts.kcore || record__threads_enabled(rec))
4045 		rec->data.is_dir = true;
4046 
4047 	if (record__threads_enabled(rec)) {
4048 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4049 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4050 			goto out_opts;
4051 		}
4052 		if (record__aio_enabled(rec)) {
4053 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4054 			goto out_opts;
4055 		}
4056 	}
4057 
4058 	if (rec->opts.comp_level != 0) {
4059 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4060 		rec->no_buildid = true;
4061 	}
4062 
4063 	if (rec->opts.record_switch_events &&
4064 	    !perf_can_record_switch_events()) {
4065 		ui__error("kernel does not support recording context switch events\n");
4066 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4067 		err = -EINVAL;
4068 		goto out_opts;
4069 	}
4070 
4071 	if (switch_output_setup(rec)) {
4072 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4073 		err = -EINVAL;
4074 		goto out_opts;
4075 	}
4076 
4077 	if (rec->switch_output.time) {
4078 		signal(SIGALRM, alarm_sig_handler);
4079 		alarm(rec->switch_output.time);
4080 	}
4081 
4082 	if (rec->switch_output.num_files) {
4083 		rec->switch_output.filenames = calloc(sizeof(char *),
4084 						      rec->switch_output.num_files);
4085 		if (!rec->switch_output.filenames) {
4086 			err = -EINVAL;
4087 			goto out_opts;
4088 		}
4089 	}
4090 
4091 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4092 		rec->timestamp_filename = false;
4093 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4094 	}
4095 
4096 	/*
4097 	 * Allow aliases to facilitate the lookup of symbols for address
4098 	 * filters. Refer to auxtrace_parse_filters().
4099 	 */
4100 	symbol_conf.allow_aliases = true;
4101 
4102 	symbol__init(NULL);
4103 
4104 	err = record__auxtrace_init(rec);
4105 	if (err)
4106 		goto out;
4107 
4108 	if (dry_run)
4109 		goto out;
4110 
4111 	err = -ENOMEM;
4112 
4113 	if (rec->no_buildid_cache || rec->no_buildid) {
4114 		disable_buildid_cache();
4115 	} else if (rec->switch_output.enabled) {
4116 		/*
4117 		 * In 'perf record --switch-output', disable buildid
4118 		 * generation by default to reduce data file switching
4119 		 * overhead. Still generate buildid if they are required
4120 		 * explicitly using
4121 		 *
4122 		 *  perf record --switch-output --no-no-buildid \
4123 		 *              --no-no-buildid-cache
4124 		 *
4125 		 * Following code equals to:
4126 		 *
4127 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4128 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4129 		 *         disable_buildid_cache();
4130 		 */
4131 		bool disable = true;
4132 
4133 		if (rec->no_buildid_set && !rec->no_buildid)
4134 			disable = false;
4135 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4136 			disable = false;
4137 		if (disable) {
4138 			rec->no_buildid = true;
4139 			rec->no_buildid_cache = true;
4140 			disable_buildid_cache();
4141 		}
4142 	}
4143 
4144 	if (record.opts.overwrite)
4145 		record.opts.tail_synthesize = true;
4146 
4147 	if (rec->evlist->core.nr_entries == 0) {
4148 		bool can_profile_kernel = perf_event_paranoid_check(1);
4149 
4150 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4151 		if (err)
4152 			goto out;
4153 	}
4154 
4155 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4156 		rec->opts.no_inherit = true;
4157 
4158 	err = target__validate(&rec->opts.target);
4159 	if (err) {
4160 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4161 		ui__warning("%s\n", errbuf);
4162 	}
4163 
4164 	err = target__parse_uid(&rec->opts.target);
4165 	if (err) {
4166 		int saved_errno = errno;
4167 
4168 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4169 		ui__error("%s", errbuf);
4170 
4171 		err = -saved_errno;
4172 		goto out;
4173 	}
4174 
4175 	/* Enable ignoring missing threads when -u/-p option is defined. */
4176 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4177 
4178 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4179 
4180 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4181 		arch__add_leaf_frame_record_opts(&rec->opts);
4182 
4183 	err = -ENOMEM;
4184 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4185 		if (rec->opts.target.pid != NULL) {
4186 			pr_err("Couldn't create thread/CPU maps: %s\n",
4187 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4188 			goto out;
4189 		}
4190 		else
4191 			usage_with_options(record_usage, record_options);
4192 	}
4193 
4194 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4195 	if (err)
4196 		goto out;
4197 
4198 	/*
4199 	 * We take all buildids when the file contains
4200 	 * AUX area tracing data because we do not decode the
4201 	 * trace because it would take too long.
4202 	 */
4203 	if (rec->opts.full_auxtrace)
4204 		rec->buildid_all = true;
4205 
4206 	if (rec->opts.text_poke) {
4207 		err = record__config_text_poke(rec->evlist);
4208 		if (err) {
4209 			pr_err("record__config_text_poke failed, error %d\n", err);
4210 			goto out;
4211 		}
4212 	}
4213 
4214 	if (rec->off_cpu) {
4215 		err = record__config_off_cpu(rec);
4216 		if (err) {
4217 			pr_err("record__config_off_cpu failed, error %d\n", err);
4218 			goto out;
4219 		}
4220 	}
4221 
4222 	if (record_opts__config(&rec->opts)) {
4223 		err = -EINVAL;
4224 		goto out;
4225 	}
4226 
4227 	err = record__config_tracking_events(rec);
4228 	if (err) {
4229 		pr_err("record__config_tracking_events failed, error %d\n", err);
4230 		goto out;
4231 	}
4232 
4233 	err = record__init_thread_masks(rec);
4234 	if (err) {
4235 		pr_err("Failed to initialize parallel data streaming masks\n");
4236 		goto out;
4237 	}
4238 
4239 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4240 		rec->opts.nr_cblocks = nr_cblocks_max;
4241 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4242 
4243 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4244 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4245 
4246 	if (rec->opts.comp_level > comp_level_max)
4247 		rec->opts.comp_level = comp_level_max;
4248 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4249 
4250 	err = __cmd_record(&record, argc, argv);
4251 out:
4252 	evlist__delete(rec->evlist);
4253 	symbol__exit();
4254 	auxtrace_record__free(rec->itr);
4255 out_opts:
4256 	record__free_thread_masks(rec, rec->nr_threads);
4257 	rec->nr_threads = 0;
4258 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4259 	return err;
4260 }
4261 
4262 static void snapshot_sig_handler(int sig __maybe_unused)
4263 {
4264 	struct record *rec = &record;
4265 
4266 	hit_auxtrace_snapshot_trigger(rec);
4267 
4268 	if (switch_output_signal(rec))
4269 		trigger_hit(&switch_output_trigger);
4270 }
4271 
4272 static void alarm_sig_handler(int sig __maybe_unused)
4273 {
4274 	struct record *rec = &record;
4275 
4276 	if (switch_output_time(rec))
4277 		trigger_hit(&switch_output_trigger);
4278 }
4279