xref: /linux/tools/perf/builtin-record.c (revision 16e5ac127d8d18adf85fe5ba847d77b58d1ed418)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include <internal/xyarray.h>
14 #include "util/parse-events.h"
15 #include "util/config.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/mmap.h"
25 #include "util/mutex.h"
26 #include "util/target.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/record.h"
31 #include "util/cpumap.h"
32 #include "util/thread_map.h"
33 #include "util/data.h"
34 #include "util/perf_regs.h"
35 #include "util/auxtrace.h"
36 #include "util/tsc.h"
37 #include "util/parse-branch-options.h"
38 #include "util/parse-regs-options.h"
39 #include "util/perf_api_probe.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/pmu.h"
50 #include "util/pmus.h"
51 #include "util/clockid.h"
52 #include "util/off_cpu.h"
53 #include "util/bpf-filter.h"
54 #include "asm/bug.h"
55 #include "perf.h"
56 #include "cputopo.h"
57 
58 #include <errno.h>
59 #include <inttypes.h>
60 #include <locale.h>
61 #include <poll.h>
62 #include <pthread.h>
63 #include <unistd.h>
64 #ifndef HAVE_GETTID
65 #include <syscall.h>
66 #endif
67 #include <sched.h>
68 #include <signal.h>
69 #ifdef HAVE_EVENTFD_SUPPORT
70 #include <sys/eventfd.h>
71 #endif
72 #include <sys/mman.h>
73 #include <sys/wait.h>
74 #include <sys/types.h>
75 #include <sys/stat.h>
76 #include <fcntl.h>
77 #include <linux/err.h>
78 #include <linux/string.h>
79 #include <linux/time64.h>
80 #include <linux/zalloc.h>
81 #include <linux/bitmap.h>
82 #include <sys/time.h>
83 
84 struct switch_output {
85 	bool		 enabled;
86 	bool		 signal;
87 	unsigned long	 size;
88 	unsigned long	 time;
89 	const char	*str;
90 	bool		 set;
91 	char		 **filenames;
92 	int		 num_files;
93 	int		 cur_file;
94 };
95 
96 struct thread_mask {
97 	struct mmap_cpu_mask	maps;
98 	struct mmap_cpu_mask	affinity;
99 };
100 
101 struct record_thread {
102 	pid_t			tid;
103 	struct thread_mask	*mask;
104 	struct {
105 		int		msg[2];
106 		int		ack[2];
107 	} pipes;
108 	struct fdarray		pollfd;
109 	int			ctlfd_pos;
110 	int			nr_mmaps;
111 	struct mmap		**maps;
112 	struct mmap		**overwrite_maps;
113 	struct record		*rec;
114 	unsigned long long	samples;
115 	unsigned long		waking;
116 	u64			bytes_written;
117 	u64			bytes_transferred;
118 	u64			bytes_compressed;
119 };
120 
121 static __thread struct record_thread *thread;
122 
123 enum thread_msg {
124 	THREAD_MSG__UNDEFINED = 0,
125 	THREAD_MSG__READY,
126 	THREAD_MSG__MAX,
127 };
128 
129 static const char *thread_msg_tags[THREAD_MSG__MAX] = {
130 	"UNDEFINED", "READY"
131 };
132 
133 enum thread_spec {
134 	THREAD_SPEC__UNDEFINED = 0,
135 	THREAD_SPEC__CPU,
136 	THREAD_SPEC__CORE,
137 	THREAD_SPEC__PACKAGE,
138 	THREAD_SPEC__NUMA,
139 	THREAD_SPEC__USER,
140 	THREAD_SPEC__MAX,
141 };
142 
143 static const char *thread_spec_tags[THREAD_SPEC__MAX] = {
144 	"undefined", "cpu", "core", "package", "numa", "user"
145 };
146 
147 struct pollfd_index_map {
148 	int evlist_pollfd_index;
149 	int thread_pollfd_index;
150 };
151 
152 struct record {
153 	struct perf_tool	tool;
154 	struct record_opts	opts;
155 	u64			bytes_written;
156 	u64			thread_bytes_written;
157 	struct perf_data	data;
158 	struct auxtrace_record	*itr;
159 	struct evlist	*evlist;
160 	struct perf_session	*session;
161 	struct evlist		*sb_evlist;
162 	pthread_t		thread_id;
163 	int			realtime_prio;
164 	bool			switch_output_event_set;
165 	bool			no_buildid;
166 	bool			no_buildid_set;
167 	bool			no_buildid_cache;
168 	bool			no_buildid_cache_set;
169 	bool			buildid_all;
170 	bool			buildid_mmap;
171 	bool			timestamp_filename;
172 	bool			timestamp_boundary;
173 	bool			off_cpu;
174 	struct switch_output	switch_output;
175 	unsigned long long	samples;
176 	unsigned long		output_max_size;	/* = 0: unlimited */
177 	struct perf_debuginfod	debuginfod;
178 	int			nr_threads;
179 	struct thread_mask	*thread_masks;
180 	struct record_thread	*thread_data;
181 	struct pollfd_index_map	*index_map;
182 	size_t			index_map_sz;
183 	size_t			index_map_cnt;
184 };
185 
186 static volatile int done;
187 
188 static volatile int auxtrace_record__snapshot_started;
189 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
190 static DEFINE_TRIGGER(switch_output_trigger);
191 
192 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
193 	"SYS", "NODE", "CPU"
194 };
195 
196 #ifndef HAVE_GETTID
197 static inline pid_t gettid(void)
198 {
199 	return (pid_t)syscall(__NR_gettid);
200 }
201 #endif
202 
203 static int record__threads_enabled(struct record *rec)
204 {
205 	return rec->opts.threads_spec;
206 }
207 
208 static bool switch_output_signal(struct record *rec)
209 {
210 	return rec->switch_output.signal &&
211 	       trigger_is_ready(&switch_output_trigger);
212 }
213 
214 static bool switch_output_size(struct record *rec)
215 {
216 	return rec->switch_output.size &&
217 	       trigger_is_ready(&switch_output_trigger) &&
218 	       (rec->bytes_written >= rec->switch_output.size);
219 }
220 
221 static bool switch_output_time(struct record *rec)
222 {
223 	return rec->switch_output.time &&
224 	       trigger_is_ready(&switch_output_trigger);
225 }
226 
227 static u64 record__bytes_written(struct record *rec)
228 {
229 	return rec->bytes_written + rec->thread_bytes_written;
230 }
231 
232 static bool record__output_max_size_exceeded(struct record *rec)
233 {
234 	return rec->output_max_size &&
235 	       (record__bytes_written(rec) >= rec->output_max_size);
236 }
237 
238 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
239 			 void *bf, size_t size)
240 {
241 	struct perf_data_file *file = &rec->session->data->file;
242 
243 	if (map && map->file)
244 		file = map->file;
245 
246 	if (perf_data_file__write(file, bf, size) < 0) {
247 		pr_err("failed to write perf data, error: %m\n");
248 		return -1;
249 	}
250 
251 	if (map && map->file) {
252 		thread->bytes_written += size;
253 		rec->thread_bytes_written += size;
254 	} else {
255 		rec->bytes_written += size;
256 	}
257 
258 	if (record__output_max_size_exceeded(rec) && !done) {
259 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
260 				" stopping session ]\n",
261 				record__bytes_written(rec) >> 10);
262 		done = 1;
263 	}
264 
265 	if (switch_output_size(rec))
266 		trigger_hit(&switch_output_trigger);
267 
268 	return 0;
269 }
270 
271 static int record__aio_enabled(struct record *rec);
272 static int record__comp_enabled(struct record *rec);
273 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
274 			    void *dst, size_t dst_size, void *src, size_t src_size);
275 
276 #ifdef HAVE_AIO_SUPPORT
277 static int record__aio_write(struct aiocb *cblock, int trace_fd,
278 		void *buf, size_t size, off_t off)
279 {
280 	int rc;
281 
282 	cblock->aio_fildes = trace_fd;
283 	cblock->aio_buf    = buf;
284 	cblock->aio_nbytes = size;
285 	cblock->aio_offset = off;
286 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
287 
288 	do {
289 		rc = aio_write(cblock);
290 		if (rc == 0) {
291 			break;
292 		} else if (errno != EAGAIN) {
293 			cblock->aio_fildes = -1;
294 			pr_err("failed to queue perf data, error: %m\n");
295 			break;
296 		}
297 	} while (1);
298 
299 	return rc;
300 }
301 
302 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
303 {
304 	void *rem_buf;
305 	off_t rem_off;
306 	size_t rem_size;
307 	int rc, aio_errno;
308 	ssize_t aio_ret, written;
309 
310 	aio_errno = aio_error(cblock);
311 	if (aio_errno == EINPROGRESS)
312 		return 0;
313 
314 	written = aio_ret = aio_return(cblock);
315 	if (aio_ret < 0) {
316 		if (aio_errno != EINTR)
317 			pr_err("failed to write perf data, error: %m\n");
318 		written = 0;
319 	}
320 
321 	rem_size = cblock->aio_nbytes - written;
322 
323 	if (rem_size == 0) {
324 		cblock->aio_fildes = -1;
325 		/*
326 		 * md->refcount is incremented in record__aio_pushfn() for
327 		 * every aio write request started in record__aio_push() so
328 		 * decrement it because the request is now complete.
329 		 */
330 		perf_mmap__put(&md->core);
331 		rc = 1;
332 	} else {
333 		/*
334 		 * aio write request may require restart with the
335 		 * reminder if the kernel didn't write whole
336 		 * chunk at once.
337 		 */
338 		rem_off = cblock->aio_offset + written;
339 		rem_buf = (void *)(cblock->aio_buf + written);
340 		record__aio_write(cblock, cblock->aio_fildes,
341 				rem_buf, rem_size, rem_off);
342 		rc = 0;
343 	}
344 
345 	return rc;
346 }
347 
348 static int record__aio_sync(struct mmap *md, bool sync_all)
349 {
350 	struct aiocb **aiocb = md->aio.aiocb;
351 	struct aiocb *cblocks = md->aio.cblocks;
352 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
353 	int i, do_suspend;
354 
355 	do {
356 		do_suspend = 0;
357 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
358 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
359 				if (sync_all)
360 					aiocb[i] = NULL;
361 				else
362 					return i;
363 			} else {
364 				/*
365 				 * Started aio write is not complete yet
366 				 * so it has to be waited before the
367 				 * next allocation.
368 				 */
369 				aiocb[i] = &cblocks[i];
370 				do_suspend = 1;
371 			}
372 		}
373 		if (!do_suspend)
374 			return -1;
375 
376 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
377 			if (!(errno == EAGAIN || errno == EINTR))
378 				pr_err("failed to sync perf data, error: %m\n");
379 		}
380 	} while (1);
381 }
382 
383 struct record_aio {
384 	struct record	*rec;
385 	void		*data;
386 	size_t		size;
387 };
388 
389 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
390 {
391 	struct record_aio *aio = to;
392 
393 	/*
394 	 * map->core.base data pointed by buf is copied into free map->aio.data[] buffer
395 	 * to release space in the kernel buffer as fast as possible, calling
396 	 * perf_mmap__consume() from perf_mmap__push() function.
397 	 *
398 	 * That lets the kernel to proceed with storing more profiling data into
399 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
400 	 *
401 	 * Coping can be done in two steps in case the chunk of profiling data
402 	 * crosses the upper bound of the kernel buffer. In this case we first move
403 	 * part of data from map->start till the upper bound and then the reminder
404 	 * from the beginning of the kernel buffer till the end of the data chunk.
405 	 */
406 
407 	if (record__comp_enabled(aio->rec)) {
408 		size = zstd_compress(aio->rec->session, NULL, aio->data + aio->size,
409 				     mmap__mmap_len(map) - aio->size,
410 				     buf, size);
411 	} else {
412 		memcpy(aio->data + aio->size, buf, size);
413 	}
414 
415 	if (!aio->size) {
416 		/*
417 		 * Increment map->refcount to guard map->aio.data[] buffer
418 		 * from premature deallocation because map object can be
419 		 * released earlier than aio write request started on
420 		 * map->aio.data[] buffer is complete.
421 		 *
422 		 * perf_mmap__put() is done at record__aio_complete()
423 		 * after started aio request completion or at record__aio_push()
424 		 * if the request failed to start.
425 		 */
426 		perf_mmap__get(&map->core);
427 	}
428 
429 	aio->size += size;
430 
431 	return size;
432 }
433 
434 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
435 {
436 	int ret, idx;
437 	int trace_fd = rec->session->data->file.fd;
438 	struct record_aio aio = { .rec = rec, .size = 0 };
439 
440 	/*
441 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
442 	 * becomes available after previous aio write operation.
443 	 */
444 
445 	idx = record__aio_sync(map, false);
446 	aio.data = map->aio.data[idx];
447 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
448 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
449 		return ret;
450 
451 	rec->samples++;
452 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
453 	if (!ret) {
454 		*off += aio.size;
455 		rec->bytes_written += aio.size;
456 		if (switch_output_size(rec))
457 			trigger_hit(&switch_output_trigger);
458 	} else {
459 		/*
460 		 * Decrement map->refcount incremented in record__aio_pushfn()
461 		 * back if record__aio_write() operation failed to start, otherwise
462 		 * map->refcount is decremented in record__aio_complete() after
463 		 * aio write operation finishes successfully.
464 		 */
465 		perf_mmap__put(&map->core);
466 	}
467 
468 	return ret;
469 }
470 
471 static off_t record__aio_get_pos(int trace_fd)
472 {
473 	return lseek(trace_fd, 0, SEEK_CUR);
474 }
475 
476 static void record__aio_set_pos(int trace_fd, off_t pos)
477 {
478 	lseek(trace_fd, pos, SEEK_SET);
479 }
480 
481 static void record__aio_mmap_read_sync(struct record *rec)
482 {
483 	int i;
484 	struct evlist *evlist = rec->evlist;
485 	struct mmap *maps = evlist->mmap;
486 
487 	if (!record__aio_enabled(rec))
488 		return;
489 
490 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
491 		struct mmap *map = &maps[i];
492 
493 		if (map->core.base)
494 			record__aio_sync(map, true);
495 	}
496 }
497 
498 static int nr_cblocks_default = 1;
499 static int nr_cblocks_max = 4;
500 
501 static int record__aio_parse(const struct option *opt,
502 			     const char *str,
503 			     int unset)
504 {
505 	struct record_opts *opts = (struct record_opts *)opt->value;
506 
507 	if (unset) {
508 		opts->nr_cblocks = 0;
509 	} else {
510 		if (str)
511 			opts->nr_cblocks = strtol(str, NULL, 0);
512 		if (!opts->nr_cblocks)
513 			opts->nr_cblocks = nr_cblocks_default;
514 	}
515 
516 	return 0;
517 }
518 #else /* HAVE_AIO_SUPPORT */
519 static int nr_cblocks_max = 0;
520 
521 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
522 			    off_t *off __maybe_unused)
523 {
524 	return -1;
525 }
526 
527 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
528 {
529 	return -1;
530 }
531 
532 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
533 {
534 }
535 
536 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
537 {
538 }
539 #endif
540 
541 static int record__aio_enabled(struct record *rec)
542 {
543 	return rec->opts.nr_cblocks > 0;
544 }
545 
546 #define MMAP_FLUSH_DEFAULT 1
547 static int record__mmap_flush_parse(const struct option *opt,
548 				    const char *str,
549 				    int unset)
550 {
551 	int flush_max;
552 	struct record_opts *opts = (struct record_opts *)opt->value;
553 	static struct parse_tag tags[] = {
554 			{ .tag  = 'B', .mult = 1       },
555 			{ .tag  = 'K', .mult = 1 << 10 },
556 			{ .tag  = 'M', .mult = 1 << 20 },
557 			{ .tag  = 'G', .mult = 1 << 30 },
558 			{ .tag  = 0 },
559 	};
560 
561 	if (unset)
562 		return 0;
563 
564 	if (str) {
565 		opts->mmap_flush = parse_tag_value(str, tags);
566 		if (opts->mmap_flush == (int)-1)
567 			opts->mmap_flush = strtol(str, NULL, 0);
568 	}
569 
570 	if (!opts->mmap_flush)
571 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
572 
573 	flush_max = evlist__mmap_size(opts->mmap_pages);
574 	flush_max /= 4;
575 	if (opts->mmap_flush > flush_max)
576 		opts->mmap_flush = flush_max;
577 
578 	return 0;
579 }
580 
581 #ifdef HAVE_ZSTD_SUPPORT
582 static unsigned int comp_level_default = 1;
583 
584 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
585 {
586 	struct record_opts *opts = opt->value;
587 
588 	if (unset) {
589 		opts->comp_level = 0;
590 	} else {
591 		if (str)
592 			opts->comp_level = strtol(str, NULL, 0);
593 		if (!opts->comp_level)
594 			opts->comp_level = comp_level_default;
595 	}
596 
597 	return 0;
598 }
599 #endif
600 static unsigned int comp_level_max = 22;
601 
602 static int record__comp_enabled(struct record *rec)
603 {
604 	return rec->opts.comp_level > 0;
605 }
606 
607 static int process_synthesized_event(struct perf_tool *tool,
608 				     union perf_event *event,
609 				     struct perf_sample *sample __maybe_unused,
610 				     struct machine *machine __maybe_unused)
611 {
612 	struct record *rec = container_of(tool, struct record, tool);
613 	return record__write(rec, NULL, event, event->header.size);
614 }
615 
616 static struct mutex synth_lock;
617 
618 static int process_locked_synthesized_event(struct perf_tool *tool,
619 				     union perf_event *event,
620 				     struct perf_sample *sample __maybe_unused,
621 				     struct machine *machine __maybe_unused)
622 {
623 	int ret;
624 
625 	mutex_lock(&synth_lock);
626 	ret = process_synthesized_event(tool, event, sample, machine);
627 	mutex_unlock(&synth_lock);
628 	return ret;
629 }
630 
631 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
632 {
633 	struct record *rec = to;
634 
635 	if (record__comp_enabled(rec)) {
636 		size = zstd_compress(rec->session, map, map->data, mmap__mmap_len(map), bf, size);
637 		bf   = map->data;
638 	}
639 
640 	thread->samples++;
641 	return record__write(rec, map, bf, size);
642 }
643 
644 static volatile sig_atomic_t signr = -1;
645 static volatile sig_atomic_t child_finished;
646 #ifdef HAVE_EVENTFD_SUPPORT
647 static volatile sig_atomic_t done_fd = -1;
648 #endif
649 
650 static void sig_handler(int sig)
651 {
652 	if (sig == SIGCHLD)
653 		child_finished = 1;
654 	else
655 		signr = sig;
656 
657 	done = 1;
658 #ifdef HAVE_EVENTFD_SUPPORT
659 	if (done_fd >= 0) {
660 		u64 tmp = 1;
661 		int orig_errno = errno;
662 
663 		/*
664 		 * It is possible for this signal handler to run after done is
665 		 * checked in the main loop, but before the perf counter fds are
666 		 * polled. If this happens, the poll() will continue to wait
667 		 * even though done is set, and will only break out if either
668 		 * another signal is received, or the counters are ready for
669 		 * read. To ensure the poll() doesn't sleep when done is set,
670 		 * use an eventfd (done_fd) to wake up the poll().
671 		 */
672 		if (write(done_fd, &tmp, sizeof(tmp)) < 0)
673 			pr_err("failed to signal wakeup fd, error: %m\n");
674 
675 		errno = orig_errno;
676 	}
677 #endif // HAVE_EVENTFD_SUPPORT
678 }
679 
680 static void sigsegv_handler(int sig)
681 {
682 	perf_hooks__recover();
683 	sighandler_dump_stack(sig);
684 }
685 
686 static void record__sig_exit(void)
687 {
688 	if (signr == -1)
689 		return;
690 
691 	signal(signr, SIG_DFL);
692 	raise(signr);
693 }
694 
695 #ifdef HAVE_AUXTRACE_SUPPORT
696 
697 static int record__process_auxtrace(struct perf_tool *tool,
698 				    struct mmap *map,
699 				    union perf_event *event, void *data1,
700 				    size_t len1, void *data2, size_t len2)
701 {
702 	struct record *rec = container_of(tool, struct record, tool);
703 	struct perf_data *data = &rec->data;
704 	size_t padding;
705 	u8 pad[8] = {0};
706 
707 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
708 		off_t file_offset;
709 		int fd = perf_data__fd(data);
710 		int err;
711 
712 		file_offset = lseek(fd, 0, SEEK_CUR);
713 		if (file_offset == -1)
714 			return -1;
715 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
716 						     event, file_offset);
717 		if (err)
718 			return err;
719 	}
720 
721 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
722 	padding = (len1 + len2) & 7;
723 	if (padding)
724 		padding = 8 - padding;
725 
726 	record__write(rec, map, event, event->header.size);
727 	record__write(rec, map, data1, len1);
728 	if (len2)
729 		record__write(rec, map, data2, len2);
730 	record__write(rec, map, &pad, padding);
731 
732 	return 0;
733 }
734 
735 static int record__auxtrace_mmap_read(struct record *rec,
736 				      struct mmap *map)
737 {
738 	int ret;
739 
740 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
741 				  record__process_auxtrace);
742 	if (ret < 0)
743 		return ret;
744 
745 	if (ret)
746 		rec->samples++;
747 
748 	return 0;
749 }
750 
751 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
752 					       struct mmap *map)
753 {
754 	int ret;
755 
756 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
757 					   record__process_auxtrace,
758 					   rec->opts.auxtrace_snapshot_size);
759 	if (ret < 0)
760 		return ret;
761 
762 	if (ret)
763 		rec->samples++;
764 
765 	return 0;
766 }
767 
768 static int record__auxtrace_read_snapshot_all(struct record *rec)
769 {
770 	int i;
771 	int rc = 0;
772 
773 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
774 		struct mmap *map = &rec->evlist->mmap[i];
775 
776 		if (!map->auxtrace_mmap.base)
777 			continue;
778 
779 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
780 			rc = -1;
781 			goto out;
782 		}
783 	}
784 out:
785 	return rc;
786 }
787 
788 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
789 {
790 	pr_debug("Recording AUX area tracing snapshot\n");
791 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
792 		trigger_error(&auxtrace_snapshot_trigger);
793 	} else {
794 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
795 			trigger_error(&auxtrace_snapshot_trigger);
796 		else
797 			trigger_ready(&auxtrace_snapshot_trigger);
798 	}
799 }
800 
801 static int record__auxtrace_snapshot_exit(struct record *rec)
802 {
803 	if (trigger_is_error(&auxtrace_snapshot_trigger))
804 		return 0;
805 
806 	if (!auxtrace_record__snapshot_started &&
807 	    auxtrace_record__snapshot_start(rec->itr))
808 		return -1;
809 
810 	record__read_auxtrace_snapshot(rec, true);
811 	if (trigger_is_error(&auxtrace_snapshot_trigger))
812 		return -1;
813 
814 	return 0;
815 }
816 
817 static int record__auxtrace_init(struct record *rec)
818 {
819 	int err;
820 
821 	if ((rec->opts.auxtrace_snapshot_opts || rec->opts.auxtrace_sample_opts)
822 	    && record__threads_enabled(rec)) {
823 		pr_err("AUX area tracing options are not available in parallel streaming mode.\n");
824 		return -EINVAL;
825 	}
826 
827 	if (!rec->itr) {
828 		rec->itr = auxtrace_record__init(rec->evlist, &err);
829 		if (err)
830 			return err;
831 	}
832 
833 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
834 					      rec->opts.auxtrace_snapshot_opts);
835 	if (err)
836 		return err;
837 
838 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
839 					    rec->opts.auxtrace_sample_opts);
840 	if (err)
841 		return err;
842 
843 	auxtrace_regroup_aux_output(rec->evlist);
844 
845 	return auxtrace_parse_filters(rec->evlist);
846 }
847 
848 #else
849 
850 static inline
851 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
852 			       struct mmap *map __maybe_unused)
853 {
854 	return 0;
855 }
856 
857 static inline
858 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
859 				    bool on_exit __maybe_unused)
860 {
861 }
862 
863 static inline
864 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
865 {
866 	return 0;
867 }
868 
869 static inline
870 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
871 {
872 	return 0;
873 }
874 
875 static int record__auxtrace_init(struct record *rec __maybe_unused)
876 {
877 	return 0;
878 }
879 
880 #endif
881 
882 static int record__config_text_poke(struct evlist *evlist)
883 {
884 	struct evsel *evsel;
885 
886 	/* Nothing to do if text poke is already configured */
887 	evlist__for_each_entry(evlist, evsel) {
888 		if (evsel->core.attr.text_poke)
889 			return 0;
890 	}
891 
892 	evsel = evlist__add_dummy_on_all_cpus(evlist);
893 	if (!evsel)
894 		return -ENOMEM;
895 
896 	evsel->core.attr.text_poke = 1;
897 	evsel->core.attr.ksymbol = 1;
898 	evsel->immediate = true;
899 	evsel__set_sample_bit(evsel, TIME);
900 
901 	return 0;
902 }
903 
904 static int record__config_off_cpu(struct record *rec)
905 {
906 	return off_cpu_prepare(rec->evlist, &rec->opts.target, &rec->opts);
907 }
908 
909 static bool record__tracking_system_wide(struct record *rec)
910 {
911 	struct evlist *evlist = rec->evlist;
912 	struct evsel *evsel;
913 
914 	/*
915 	 * If non-dummy evsel exists, system_wide sideband is need to
916 	 * help parse sample information.
917 	 * For example, PERF_EVENT_MMAP event to help parse symbol,
918 	 * and PERF_EVENT_COMM event to help parse task executable name.
919 	 */
920 	evlist__for_each_entry(evlist, evsel) {
921 		if (!evsel__is_dummy_event(evsel))
922 			return true;
923 	}
924 
925 	return false;
926 }
927 
928 static int record__config_tracking_events(struct record *rec)
929 {
930 	struct record_opts *opts = &rec->opts;
931 	struct evlist *evlist = rec->evlist;
932 	bool system_wide = false;
933 	struct evsel *evsel;
934 
935 	/*
936 	 * For initial_delay, system wide or a hybrid system, we need to add
937 	 * tracking event so that we can track PERF_RECORD_MMAP to cover the
938 	 * delay of waiting or event synthesis.
939 	 */
940 	if (opts->target.initial_delay || target__has_cpu(&opts->target) ||
941 	    perf_pmus__num_core_pmus() > 1) {
942 
943 		/*
944 		 * User space tasks can migrate between CPUs, so when tracing
945 		 * selected CPUs, sideband for all CPUs is still needed.
946 		 */
947 		if (!!opts->target.cpu_list && record__tracking_system_wide(rec))
948 			system_wide = true;
949 
950 		evsel = evlist__findnew_tracking_event(evlist, system_wide);
951 		if (!evsel)
952 			return -ENOMEM;
953 
954 		/*
955 		 * Enable the tracking event when the process is forked for
956 		 * initial_delay, immediately for system wide.
957 		 */
958 		if (opts->target.initial_delay && !evsel->immediate &&
959 		    !target__has_cpu(&opts->target))
960 			evsel->core.attr.enable_on_exec = 1;
961 		else
962 			evsel->immediate = 1;
963 	}
964 
965 	return 0;
966 }
967 
968 static bool record__kcore_readable(struct machine *machine)
969 {
970 	char kcore[PATH_MAX];
971 	int fd;
972 
973 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
974 
975 	fd = open(kcore, O_RDONLY);
976 	if (fd < 0)
977 		return false;
978 
979 	close(fd);
980 
981 	return true;
982 }
983 
984 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
985 {
986 	char from_dir[PATH_MAX];
987 	char kcore_dir[PATH_MAX];
988 	int ret;
989 
990 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
991 
992 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
993 	if (ret)
994 		return ret;
995 
996 	return kcore_copy(from_dir, kcore_dir);
997 }
998 
999 static void record__thread_data_init_pipes(struct record_thread *thread_data)
1000 {
1001 	thread_data->pipes.msg[0] = -1;
1002 	thread_data->pipes.msg[1] = -1;
1003 	thread_data->pipes.ack[0] = -1;
1004 	thread_data->pipes.ack[1] = -1;
1005 }
1006 
1007 static int record__thread_data_open_pipes(struct record_thread *thread_data)
1008 {
1009 	if (pipe(thread_data->pipes.msg))
1010 		return -EINVAL;
1011 
1012 	if (pipe(thread_data->pipes.ack)) {
1013 		close(thread_data->pipes.msg[0]);
1014 		thread_data->pipes.msg[0] = -1;
1015 		close(thread_data->pipes.msg[1]);
1016 		thread_data->pipes.msg[1] = -1;
1017 		return -EINVAL;
1018 	}
1019 
1020 	pr_debug2("thread_data[%p]: msg=[%d,%d], ack=[%d,%d]\n", thread_data,
1021 		 thread_data->pipes.msg[0], thread_data->pipes.msg[1],
1022 		 thread_data->pipes.ack[0], thread_data->pipes.ack[1]);
1023 
1024 	return 0;
1025 }
1026 
1027 static void record__thread_data_close_pipes(struct record_thread *thread_data)
1028 {
1029 	if (thread_data->pipes.msg[0] != -1) {
1030 		close(thread_data->pipes.msg[0]);
1031 		thread_data->pipes.msg[0] = -1;
1032 	}
1033 	if (thread_data->pipes.msg[1] != -1) {
1034 		close(thread_data->pipes.msg[1]);
1035 		thread_data->pipes.msg[1] = -1;
1036 	}
1037 	if (thread_data->pipes.ack[0] != -1) {
1038 		close(thread_data->pipes.ack[0]);
1039 		thread_data->pipes.ack[0] = -1;
1040 	}
1041 	if (thread_data->pipes.ack[1] != -1) {
1042 		close(thread_data->pipes.ack[1]);
1043 		thread_data->pipes.ack[1] = -1;
1044 	}
1045 }
1046 
1047 static bool evlist__per_thread(struct evlist *evlist)
1048 {
1049 	return cpu_map__is_dummy(evlist->core.user_requested_cpus);
1050 }
1051 
1052 static int record__thread_data_init_maps(struct record_thread *thread_data, struct evlist *evlist)
1053 {
1054 	int m, tm, nr_mmaps = evlist->core.nr_mmaps;
1055 	struct mmap *mmap = evlist->mmap;
1056 	struct mmap *overwrite_mmap = evlist->overwrite_mmap;
1057 	struct perf_cpu_map *cpus = evlist->core.all_cpus;
1058 	bool per_thread = evlist__per_thread(evlist);
1059 
1060 	if (per_thread)
1061 		thread_data->nr_mmaps = nr_mmaps;
1062 	else
1063 		thread_data->nr_mmaps = bitmap_weight(thread_data->mask->maps.bits,
1064 						      thread_data->mask->maps.nbits);
1065 	if (mmap) {
1066 		thread_data->maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1067 		if (!thread_data->maps)
1068 			return -ENOMEM;
1069 	}
1070 	if (overwrite_mmap) {
1071 		thread_data->overwrite_maps = zalloc(thread_data->nr_mmaps * sizeof(struct mmap *));
1072 		if (!thread_data->overwrite_maps) {
1073 			zfree(&thread_data->maps);
1074 			return -ENOMEM;
1075 		}
1076 	}
1077 	pr_debug2("thread_data[%p]: nr_mmaps=%d, maps=%p, ow_maps=%p\n", thread_data,
1078 		 thread_data->nr_mmaps, thread_data->maps, thread_data->overwrite_maps);
1079 
1080 	for (m = 0, tm = 0; m < nr_mmaps && tm < thread_data->nr_mmaps; m++) {
1081 		if (per_thread ||
1082 		    test_bit(perf_cpu_map__cpu(cpus, m).cpu, thread_data->mask->maps.bits)) {
1083 			if (thread_data->maps) {
1084 				thread_data->maps[tm] = &mmap[m];
1085 				pr_debug2("thread_data[%p]: cpu%d: maps[%d] -> mmap[%d]\n",
1086 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1087 			}
1088 			if (thread_data->overwrite_maps) {
1089 				thread_data->overwrite_maps[tm] = &overwrite_mmap[m];
1090 				pr_debug2("thread_data[%p]: cpu%d: ow_maps[%d] -> ow_mmap[%d]\n",
1091 					  thread_data, perf_cpu_map__cpu(cpus, m).cpu, tm, m);
1092 			}
1093 			tm++;
1094 		}
1095 	}
1096 
1097 	return 0;
1098 }
1099 
1100 static int record__thread_data_init_pollfd(struct record_thread *thread_data, struct evlist *evlist)
1101 {
1102 	int f, tm, pos;
1103 	struct mmap *map, *overwrite_map;
1104 
1105 	fdarray__init(&thread_data->pollfd, 64);
1106 
1107 	for (tm = 0; tm < thread_data->nr_mmaps; tm++) {
1108 		map = thread_data->maps ? thread_data->maps[tm] : NULL;
1109 		overwrite_map = thread_data->overwrite_maps ?
1110 				thread_data->overwrite_maps[tm] : NULL;
1111 
1112 		for (f = 0; f < evlist->core.pollfd.nr; f++) {
1113 			void *ptr = evlist->core.pollfd.priv[f].ptr;
1114 
1115 			if ((map && ptr == map) || (overwrite_map && ptr == overwrite_map)) {
1116 				pos = fdarray__dup_entry_from(&thread_data->pollfd, f,
1117 							      &evlist->core.pollfd);
1118 				if (pos < 0)
1119 					return pos;
1120 				pr_debug2("thread_data[%p]: pollfd[%d] <- event_fd=%d\n",
1121 					 thread_data, pos, evlist->core.pollfd.entries[f].fd);
1122 			}
1123 		}
1124 	}
1125 
1126 	return 0;
1127 }
1128 
1129 static void record__free_thread_data(struct record *rec)
1130 {
1131 	int t;
1132 	struct record_thread *thread_data = rec->thread_data;
1133 
1134 	if (thread_data == NULL)
1135 		return;
1136 
1137 	for (t = 0; t < rec->nr_threads; t++) {
1138 		record__thread_data_close_pipes(&thread_data[t]);
1139 		zfree(&thread_data[t].maps);
1140 		zfree(&thread_data[t].overwrite_maps);
1141 		fdarray__exit(&thread_data[t].pollfd);
1142 	}
1143 
1144 	zfree(&rec->thread_data);
1145 }
1146 
1147 static int record__map_thread_evlist_pollfd_indexes(struct record *rec,
1148 						    int evlist_pollfd_index,
1149 						    int thread_pollfd_index)
1150 {
1151 	size_t x = rec->index_map_cnt;
1152 
1153 	if (realloc_array_as_needed(rec->index_map, rec->index_map_sz, x, NULL))
1154 		return -ENOMEM;
1155 	rec->index_map[x].evlist_pollfd_index = evlist_pollfd_index;
1156 	rec->index_map[x].thread_pollfd_index = thread_pollfd_index;
1157 	rec->index_map_cnt += 1;
1158 	return 0;
1159 }
1160 
1161 static int record__update_evlist_pollfd_from_thread(struct record *rec,
1162 						    struct evlist *evlist,
1163 						    struct record_thread *thread_data)
1164 {
1165 	struct pollfd *e_entries = evlist->core.pollfd.entries;
1166 	struct pollfd *t_entries = thread_data->pollfd.entries;
1167 	int err = 0;
1168 	size_t i;
1169 
1170 	for (i = 0; i < rec->index_map_cnt; i++) {
1171 		int e_pos = rec->index_map[i].evlist_pollfd_index;
1172 		int t_pos = rec->index_map[i].thread_pollfd_index;
1173 
1174 		if (e_entries[e_pos].fd != t_entries[t_pos].fd ||
1175 		    e_entries[e_pos].events != t_entries[t_pos].events) {
1176 			pr_err("Thread and evlist pollfd index mismatch\n");
1177 			err = -EINVAL;
1178 			continue;
1179 		}
1180 		e_entries[e_pos].revents = t_entries[t_pos].revents;
1181 	}
1182 	return err;
1183 }
1184 
1185 static int record__dup_non_perf_events(struct record *rec,
1186 				       struct evlist *evlist,
1187 				       struct record_thread *thread_data)
1188 {
1189 	struct fdarray *fda = &evlist->core.pollfd;
1190 	int i, ret;
1191 
1192 	for (i = 0; i < fda->nr; i++) {
1193 		if (!(fda->priv[i].flags & fdarray_flag__non_perf_event))
1194 			continue;
1195 		ret = fdarray__dup_entry_from(&thread_data->pollfd, i, fda);
1196 		if (ret < 0) {
1197 			pr_err("Failed to duplicate descriptor in main thread pollfd\n");
1198 			return ret;
1199 		}
1200 		pr_debug2("thread_data[%p]: pollfd[%d] <- non_perf_event fd=%d\n",
1201 			  thread_data, ret, fda->entries[i].fd);
1202 		ret = record__map_thread_evlist_pollfd_indexes(rec, i, ret);
1203 		if (ret < 0) {
1204 			pr_err("Failed to map thread and evlist pollfd indexes\n");
1205 			return ret;
1206 		}
1207 	}
1208 	return 0;
1209 }
1210 
1211 static int record__alloc_thread_data(struct record *rec, struct evlist *evlist)
1212 {
1213 	int t, ret;
1214 	struct record_thread *thread_data;
1215 
1216 	rec->thread_data = zalloc(rec->nr_threads * sizeof(*(rec->thread_data)));
1217 	if (!rec->thread_data) {
1218 		pr_err("Failed to allocate thread data\n");
1219 		return -ENOMEM;
1220 	}
1221 	thread_data = rec->thread_data;
1222 
1223 	for (t = 0; t < rec->nr_threads; t++)
1224 		record__thread_data_init_pipes(&thread_data[t]);
1225 
1226 	for (t = 0; t < rec->nr_threads; t++) {
1227 		thread_data[t].rec = rec;
1228 		thread_data[t].mask = &rec->thread_masks[t];
1229 		ret = record__thread_data_init_maps(&thread_data[t], evlist);
1230 		if (ret) {
1231 			pr_err("Failed to initialize thread[%d] maps\n", t);
1232 			goto out_free;
1233 		}
1234 		ret = record__thread_data_init_pollfd(&thread_data[t], evlist);
1235 		if (ret) {
1236 			pr_err("Failed to initialize thread[%d] pollfd\n", t);
1237 			goto out_free;
1238 		}
1239 		if (t) {
1240 			thread_data[t].tid = -1;
1241 			ret = record__thread_data_open_pipes(&thread_data[t]);
1242 			if (ret) {
1243 				pr_err("Failed to open thread[%d] communication pipes\n", t);
1244 				goto out_free;
1245 			}
1246 			ret = fdarray__add(&thread_data[t].pollfd, thread_data[t].pipes.msg[0],
1247 					   POLLIN | POLLERR | POLLHUP, fdarray_flag__nonfilterable);
1248 			if (ret < 0) {
1249 				pr_err("Failed to add descriptor to thread[%d] pollfd\n", t);
1250 				goto out_free;
1251 			}
1252 			thread_data[t].ctlfd_pos = ret;
1253 			pr_debug2("thread_data[%p]: pollfd[%d] <- ctl_fd=%d\n",
1254 				 thread_data, thread_data[t].ctlfd_pos,
1255 				 thread_data[t].pipes.msg[0]);
1256 		} else {
1257 			thread_data[t].tid = gettid();
1258 
1259 			ret = record__dup_non_perf_events(rec, evlist, &thread_data[t]);
1260 			if (ret < 0)
1261 				goto out_free;
1262 
1263 			thread_data[t].ctlfd_pos = -1; /* Not used */
1264 		}
1265 	}
1266 
1267 	return 0;
1268 
1269 out_free:
1270 	record__free_thread_data(rec);
1271 
1272 	return ret;
1273 }
1274 
1275 static int record__mmap_evlist(struct record *rec,
1276 			       struct evlist *evlist)
1277 {
1278 	int i, ret;
1279 	struct record_opts *opts = &rec->opts;
1280 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
1281 				  opts->auxtrace_sample_mode;
1282 	char msg[512];
1283 
1284 	if (opts->affinity != PERF_AFFINITY_SYS)
1285 		cpu__setup_cpunode_map();
1286 
1287 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
1288 				 opts->auxtrace_mmap_pages,
1289 				 auxtrace_overwrite,
1290 				 opts->nr_cblocks, opts->affinity,
1291 				 opts->mmap_flush, opts->comp_level) < 0) {
1292 		if (errno == EPERM) {
1293 			pr_err("Permission error mapping pages.\n"
1294 			       "Consider increasing "
1295 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
1296 			       "or try again with a smaller value of -m/--mmap_pages.\n"
1297 			       "(current value: %u,%u)\n",
1298 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
1299 			return -errno;
1300 		} else {
1301 			pr_err("failed to mmap with %d (%s)\n", errno,
1302 				str_error_r(errno, msg, sizeof(msg)));
1303 			if (errno)
1304 				return -errno;
1305 			else
1306 				return -EINVAL;
1307 		}
1308 	}
1309 
1310 	if (evlist__initialize_ctlfd(evlist, opts->ctl_fd, opts->ctl_fd_ack))
1311 		return -1;
1312 
1313 	ret = record__alloc_thread_data(rec, evlist);
1314 	if (ret)
1315 		return ret;
1316 
1317 	if (record__threads_enabled(rec)) {
1318 		ret = perf_data__create_dir(&rec->data, evlist->core.nr_mmaps);
1319 		if (ret) {
1320 			pr_err("Failed to create data directory: %s\n", strerror(-ret));
1321 			return ret;
1322 		}
1323 		for (i = 0; i < evlist->core.nr_mmaps; i++) {
1324 			if (evlist->mmap)
1325 				evlist->mmap[i].file = &rec->data.dir.files[i];
1326 			if (evlist->overwrite_mmap)
1327 				evlist->overwrite_mmap[i].file = &rec->data.dir.files[i];
1328 		}
1329 	}
1330 
1331 	return 0;
1332 }
1333 
1334 static int record__mmap(struct record *rec)
1335 {
1336 	return record__mmap_evlist(rec, rec->evlist);
1337 }
1338 
1339 static int record__open(struct record *rec)
1340 {
1341 	char msg[BUFSIZ];
1342 	struct evsel *pos;
1343 	struct evlist *evlist = rec->evlist;
1344 	struct perf_session *session = rec->session;
1345 	struct record_opts *opts = &rec->opts;
1346 	int rc = 0;
1347 
1348 	evlist__config(evlist, opts, &callchain_param);
1349 
1350 	evlist__for_each_entry(evlist, pos) {
1351 try_again:
1352 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
1353 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
1354 				if (verbose > 0)
1355 					ui__warning("%s\n", msg);
1356 				goto try_again;
1357 			}
1358 			if ((errno == EINVAL || errno == EBADF) &&
1359 			    pos->core.leader != &pos->core &&
1360 			    pos->weak_group) {
1361 			        pos = evlist__reset_weak_group(evlist, pos, true);
1362 				goto try_again;
1363 			}
1364 			rc = -errno;
1365 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
1366 			ui__error("%s\n", msg);
1367 			goto out;
1368 		}
1369 
1370 		pos->supported = true;
1371 	}
1372 
1373 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
1374 		pr_warning(
1375 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1376 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
1377 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1378 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1379 "Samples in kernel modules won't be resolved at all.\n\n"
1380 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1381 "even with a suitable vmlinux or kallsyms file.\n\n");
1382 	}
1383 
1384 	if (evlist__apply_filters(evlist, &pos)) {
1385 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
1386 			pos->filter ?: "BPF", evsel__name(pos), errno,
1387 			str_error_r(errno, msg, sizeof(msg)));
1388 		rc = -1;
1389 		goto out;
1390 	}
1391 
1392 	rc = record__mmap(rec);
1393 	if (rc)
1394 		goto out;
1395 
1396 	session->evlist = evlist;
1397 	perf_session__set_id_hdr_size(session);
1398 out:
1399 	return rc;
1400 }
1401 
1402 static void set_timestamp_boundary(struct record *rec, u64 sample_time)
1403 {
1404 	if (rec->evlist->first_sample_time == 0)
1405 		rec->evlist->first_sample_time = sample_time;
1406 
1407 	if (sample_time)
1408 		rec->evlist->last_sample_time = sample_time;
1409 }
1410 
1411 static int process_sample_event(struct perf_tool *tool,
1412 				union perf_event *event,
1413 				struct perf_sample *sample,
1414 				struct evsel *evsel,
1415 				struct machine *machine)
1416 {
1417 	struct record *rec = container_of(tool, struct record, tool);
1418 
1419 	set_timestamp_boundary(rec, sample->time);
1420 
1421 	if (rec->buildid_all)
1422 		return 0;
1423 
1424 	rec->samples++;
1425 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
1426 }
1427 
1428 static int process_buildids(struct record *rec)
1429 {
1430 	struct perf_session *session = rec->session;
1431 
1432 	if (perf_data__size(&rec->data) == 0)
1433 		return 0;
1434 
1435 	/*
1436 	 * During this process, it'll load kernel map and replace the
1437 	 * dso->long_name to a real pathname it found.  In this case
1438 	 * we prefer the vmlinux path like
1439 	 *   /lib/modules/3.16.4/build/vmlinux
1440 	 *
1441 	 * rather than build-id path (in debug directory).
1442 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1443 	 */
1444 	symbol_conf.ignore_vmlinux_buildid = true;
1445 
1446 	/*
1447 	 * If --buildid-all is given, it marks all DSO regardless of hits,
1448 	 * so no need to process samples. But if timestamp_boundary is enabled,
1449 	 * it still needs to walk on all samples to get the timestamps of
1450 	 * first/last samples.
1451 	 */
1452 	if (rec->buildid_all && !rec->timestamp_boundary)
1453 		rec->tool.sample = NULL;
1454 
1455 	return perf_session__process_events(session);
1456 }
1457 
1458 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1459 {
1460 	int err;
1461 	struct perf_tool *tool = data;
1462 	/*
1463 	 *As for guest kernel when processing subcommand record&report,
1464 	 *we arrange module mmap prior to guest kernel mmap and trigger
1465 	 *a preload dso because default guest module symbols are loaded
1466 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
1467 	 *method is used to avoid symbol missing when the first addr is
1468 	 *in module instead of in guest kernel.
1469 	 */
1470 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1471 					     machine);
1472 	if (err < 0)
1473 		pr_err("Couldn't record guest kernel [%d]'s reference"
1474 		       " relocation symbol.\n", machine->pid);
1475 
1476 	/*
1477 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1478 	 * have no _text sometimes.
1479 	 */
1480 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1481 						 machine);
1482 	if (err < 0)
1483 		pr_err("Couldn't record guest kernel [%d]'s reference"
1484 		       " relocation symbol.\n", machine->pid);
1485 }
1486 
1487 static struct perf_event_header finished_round_event = {
1488 	.size = sizeof(struct perf_event_header),
1489 	.type = PERF_RECORD_FINISHED_ROUND,
1490 };
1491 
1492 static struct perf_event_header finished_init_event = {
1493 	.size = sizeof(struct perf_event_header),
1494 	.type = PERF_RECORD_FINISHED_INIT,
1495 };
1496 
1497 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1498 {
1499 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1500 	    !bitmap_equal(thread->mask->affinity.bits, map->affinity_mask.bits,
1501 			  thread->mask->affinity.nbits)) {
1502 		bitmap_zero(thread->mask->affinity.bits, thread->mask->affinity.nbits);
1503 		bitmap_or(thread->mask->affinity.bits, thread->mask->affinity.bits,
1504 			  map->affinity_mask.bits, thread->mask->affinity.nbits);
1505 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
1506 					(cpu_set_t *)thread->mask->affinity.bits);
1507 		if (verbose == 2) {
1508 			pr_debug("threads[%d]: running on cpu%d: ", thread->tid, sched_getcpu());
1509 			mmap_cpu_mask__scnprintf(&thread->mask->affinity, "affinity");
1510 		}
1511 	}
1512 }
1513 
1514 static size_t process_comp_header(void *record, size_t increment)
1515 {
1516 	struct perf_record_compressed *event = record;
1517 	size_t size = sizeof(*event);
1518 
1519 	if (increment) {
1520 		event->header.size += increment;
1521 		return increment;
1522 	}
1523 
1524 	event->header.type = PERF_RECORD_COMPRESSED;
1525 	event->header.size = size;
1526 
1527 	return size;
1528 }
1529 
1530 static size_t zstd_compress(struct perf_session *session, struct mmap *map,
1531 			    void *dst, size_t dst_size, void *src, size_t src_size)
1532 {
1533 	size_t compressed;
1534 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1535 	struct zstd_data *zstd_data = &session->zstd_data;
1536 
1537 	if (map && map->file)
1538 		zstd_data = &map->zstd_data;
1539 
1540 	compressed = zstd_compress_stream_to_records(zstd_data, dst, dst_size, src, src_size,
1541 						     max_record_size, process_comp_header);
1542 
1543 	if (map && map->file) {
1544 		thread->bytes_transferred += src_size;
1545 		thread->bytes_compressed  += compressed;
1546 	} else {
1547 		session->bytes_transferred += src_size;
1548 		session->bytes_compressed  += compressed;
1549 	}
1550 
1551 	return compressed;
1552 }
1553 
1554 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1555 				    bool overwrite, bool synch)
1556 {
1557 	u64 bytes_written = rec->bytes_written;
1558 	int i;
1559 	int rc = 0;
1560 	int nr_mmaps;
1561 	struct mmap **maps;
1562 	int trace_fd = rec->data.file.fd;
1563 	off_t off = 0;
1564 
1565 	if (!evlist)
1566 		return 0;
1567 
1568 	nr_mmaps = thread->nr_mmaps;
1569 	maps = overwrite ? thread->overwrite_maps : thread->maps;
1570 
1571 	if (!maps)
1572 		return 0;
1573 
1574 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1575 		return 0;
1576 
1577 	if (record__aio_enabled(rec))
1578 		off = record__aio_get_pos(trace_fd);
1579 
1580 	for (i = 0; i < nr_mmaps; i++) {
1581 		u64 flush = 0;
1582 		struct mmap *map = maps[i];
1583 
1584 		if (map->core.base) {
1585 			record__adjust_affinity(rec, map);
1586 			if (synch) {
1587 				flush = map->core.flush;
1588 				map->core.flush = 1;
1589 			}
1590 			if (!record__aio_enabled(rec)) {
1591 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1592 					if (synch)
1593 						map->core.flush = flush;
1594 					rc = -1;
1595 					goto out;
1596 				}
1597 			} else {
1598 				if (record__aio_push(rec, map, &off) < 0) {
1599 					record__aio_set_pos(trace_fd, off);
1600 					if (synch)
1601 						map->core.flush = flush;
1602 					rc = -1;
1603 					goto out;
1604 				}
1605 			}
1606 			if (synch)
1607 				map->core.flush = flush;
1608 		}
1609 
1610 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1611 		    !rec->opts.auxtrace_sample_mode &&
1612 		    record__auxtrace_mmap_read(rec, map) != 0) {
1613 			rc = -1;
1614 			goto out;
1615 		}
1616 	}
1617 
1618 	if (record__aio_enabled(rec))
1619 		record__aio_set_pos(trace_fd, off);
1620 
1621 	/*
1622 	 * Mark the round finished in case we wrote
1623 	 * at least one event.
1624 	 *
1625 	 * No need for round events in directory mode,
1626 	 * because per-cpu maps and files have data
1627 	 * sorted by kernel.
1628 	 */
1629 	if (!record__threads_enabled(rec) && bytes_written != rec->bytes_written)
1630 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1631 
1632 	if (overwrite)
1633 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1634 out:
1635 	return rc;
1636 }
1637 
1638 static int record__mmap_read_all(struct record *rec, bool synch)
1639 {
1640 	int err;
1641 
1642 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1643 	if (err)
1644 		return err;
1645 
1646 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1647 }
1648 
1649 static void record__thread_munmap_filtered(struct fdarray *fda, int fd,
1650 					   void *arg __maybe_unused)
1651 {
1652 	struct perf_mmap *map = fda->priv[fd].ptr;
1653 
1654 	if (map)
1655 		perf_mmap__put(map);
1656 }
1657 
1658 static void *record__thread(void *arg)
1659 {
1660 	enum thread_msg msg = THREAD_MSG__READY;
1661 	bool terminate = false;
1662 	struct fdarray *pollfd;
1663 	int err, ctlfd_pos;
1664 
1665 	thread = arg;
1666 	thread->tid = gettid();
1667 
1668 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1669 	if (err == -1)
1670 		pr_warning("threads[%d]: failed to notify on start: %s\n",
1671 			   thread->tid, strerror(errno));
1672 
1673 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
1674 
1675 	pollfd = &thread->pollfd;
1676 	ctlfd_pos = thread->ctlfd_pos;
1677 
1678 	for (;;) {
1679 		unsigned long long hits = thread->samples;
1680 
1681 		if (record__mmap_read_all(thread->rec, false) < 0 || terminate)
1682 			break;
1683 
1684 		if (hits == thread->samples) {
1685 
1686 			err = fdarray__poll(pollfd, -1);
1687 			/*
1688 			 * Propagate error, only if there's any. Ignore positive
1689 			 * number of returned events and interrupt error.
1690 			 */
1691 			if (err > 0 || (err < 0 && errno == EINTR))
1692 				err = 0;
1693 			thread->waking++;
1694 
1695 			if (fdarray__filter(pollfd, POLLERR | POLLHUP,
1696 					    record__thread_munmap_filtered, NULL) == 0)
1697 				break;
1698 		}
1699 
1700 		if (pollfd->entries[ctlfd_pos].revents & POLLHUP) {
1701 			terminate = true;
1702 			close(thread->pipes.msg[0]);
1703 			thread->pipes.msg[0] = -1;
1704 			pollfd->entries[ctlfd_pos].fd = -1;
1705 			pollfd->entries[ctlfd_pos].events = 0;
1706 		}
1707 
1708 		pollfd->entries[ctlfd_pos].revents = 0;
1709 	}
1710 	record__mmap_read_all(thread->rec, true);
1711 
1712 	err = write(thread->pipes.ack[1], &msg, sizeof(msg));
1713 	if (err == -1)
1714 		pr_warning("threads[%d]: failed to notify on termination: %s\n",
1715 			   thread->tid, strerror(errno));
1716 
1717 	return NULL;
1718 }
1719 
1720 static void record__init_features(struct record *rec)
1721 {
1722 	struct perf_session *session = rec->session;
1723 	int feat;
1724 
1725 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1726 		perf_header__set_feat(&session->header, feat);
1727 
1728 	if (rec->no_buildid)
1729 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1730 
1731 #ifdef HAVE_LIBTRACEEVENT
1732 	if (!have_tracepoints(&rec->evlist->core.entries))
1733 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1734 #endif
1735 
1736 	if (!rec->opts.branch_stack)
1737 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1738 
1739 	if (!rec->opts.full_auxtrace)
1740 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1741 
1742 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1743 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1744 
1745 	if (!rec->opts.use_clockid)
1746 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1747 
1748 	if (!record__threads_enabled(rec))
1749 		perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1750 
1751 	if (!record__comp_enabled(rec))
1752 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1753 
1754 	perf_header__clear_feat(&session->header, HEADER_STAT);
1755 }
1756 
1757 static void
1758 record__finish_output(struct record *rec)
1759 {
1760 	int i;
1761 	struct perf_data *data = &rec->data;
1762 	int fd = perf_data__fd(data);
1763 
1764 	if (data->is_pipe)
1765 		return;
1766 
1767 	rec->session->header.data_size += rec->bytes_written;
1768 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1769 	if (record__threads_enabled(rec)) {
1770 		for (i = 0; i < data->dir.nr; i++)
1771 			data->dir.files[i].size = lseek(data->dir.files[i].fd, 0, SEEK_CUR);
1772 	}
1773 
1774 	if (!rec->no_buildid) {
1775 		process_buildids(rec);
1776 
1777 		if (rec->buildid_all)
1778 			dsos__hit_all(rec->session);
1779 	}
1780 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1781 
1782 	return;
1783 }
1784 
1785 static int record__synthesize_workload(struct record *rec, bool tail)
1786 {
1787 	int err;
1788 	struct perf_thread_map *thread_map;
1789 	bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
1790 
1791 	if (rec->opts.tail_synthesize != tail)
1792 		return 0;
1793 
1794 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1795 	if (thread_map == NULL)
1796 		return -1;
1797 
1798 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1799 						 process_synthesized_event,
1800 						 &rec->session->machines.host,
1801 						 needs_mmap,
1802 						 rec->opts.sample_address);
1803 	perf_thread_map__put(thread_map);
1804 	return err;
1805 }
1806 
1807 static int write_finished_init(struct record *rec, bool tail)
1808 {
1809 	if (rec->opts.tail_synthesize != tail)
1810 		return 0;
1811 
1812 	return record__write(rec, NULL, &finished_init_event, sizeof(finished_init_event));
1813 }
1814 
1815 static int record__synthesize(struct record *rec, bool tail);
1816 
1817 static int
1818 record__switch_output(struct record *rec, bool at_exit)
1819 {
1820 	struct perf_data *data = &rec->data;
1821 	int fd, err;
1822 	char *new_filename;
1823 
1824 	/* Same Size:      "2015122520103046"*/
1825 	char timestamp[] = "InvalidTimestamp";
1826 
1827 	record__aio_mmap_read_sync(rec);
1828 
1829 	write_finished_init(rec, true);
1830 
1831 	record__synthesize(rec, true);
1832 	if (target__none(&rec->opts.target))
1833 		record__synthesize_workload(rec, true);
1834 
1835 	rec->samples = 0;
1836 	record__finish_output(rec);
1837 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1838 	if (err) {
1839 		pr_err("Failed to get current timestamp\n");
1840 		return -EINVAL;
1841 	}
1842 
1843 	fd = perf_data__switch(data, timestamp,
1844 				    rec->session->header.data_offset,
1845 				    at_exit, &new_filename);
1846 	if (fd >= 0 && !at_exit) {
1847 		rec->bytes_written = 0;
1848 		rec->session->header.data_size = 0;
1849 	}
1850 
1851 	if (!quiet)
1852 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1853 			data->path, timestamp);
1854 
1855 	if (rec->switch_output.num_files) {
1856 		int n = rec->switch_output.cur_file + 1;
1857 
1858 		if (n >= rec->switch_output.num_files)
1859 			n = 0;
1860 		rec->switch_output.cur_file = n;
1861 		if (rec->switch_output.filenames[n]) {
1862 			remove(rec->switch_output.filenames[n]);
1863 			zfree(&rec->switch_output.filenames[n]);
1864 		}
1865 		rec->switch_output.filenames[n] = new_filename;
1866 	} else {
1867 		free(new_filename);
1868 	}
1869 
1870 	/* Output tracking events */
1871 	if (!at_exit) {
1872 		record__synthesize(rec, false);
1873 
1874 		/*
1875 		 * In 'perf record --switch-output' without -a,
1876 		 * record__synthesize() in record__switch_output() won't
1877 		 * generate tracking events because there's no thread_map
1878 		 * in evlist. Which causes newly created perf.data doesn't
1879 		 * contain map and comm information.
1880 		 * Create a fake thread_map and directly call
1881 		 * perf_event__synthesize_thread_map() for those events.
1882 		 */
1883 		if (target__none(&rec->opts.target))
1884 			record__synthesize_workload(rec, false);
1885 		write_finished_init(rec, false);
1886 	}
1887 	return fd;
1888 }
1889 
1890 static void __record__save_lost_samples(struct record *rec, struct evsel *evsel,
1891 					struct perf_record_lost_samples *lost,
1892 					int cpu_idx, int thread_idx, u64 lost_count,
1893 					u16 misc_flag)
1894 {
1895 	struct perf_sample_id *sid;
1896 	struct perf_sample sample = {};
1897 	int id_hdr_size;
1898 
1899 	lost->lost = lost_count;
1900 	if (evsel->core.ids) {
1901 		sid = xyarray__entry(evsel->core.sample_id, cpu_idx, thread_idx);
1902 		sample.id = sid->id;
1903 	}
1904 
1905 	id_hdr_size = perf_event__synthesize_id_sample((void *)(lost + 1),
1906 						       evsel->core.attr.sample_type, &sample);
1907 	lost->header.size = sizeof(*lost) + id_hdr_size;
1908 	lost->header.misc = misc_flag;
1909 	record__write(rec, NULL, lost, lost->header.size);
1910 }
1911 
1912 static void record__read_lost_samples(struct record *rec)
1913 {
1914 	struct perf_session *session = rec->session;
1915 	struct perf_record_lost_samples *lost;
1916 	struct evsel *evsel;
1917 
1918 	/* there was an error during record__open */
1919 	if (session->evlist == NULL)
1920 		return;
1921 
1922 	lost = zalloc(PERF_SAMPLE_MAX_SIZE);
1923 	if (lost == NULL) {
1924 		pr_debug("Memory allocation failed\n");
1925 		return;
1926 	}
1927 
1928 	lost->header.type = PERF_RECORD_LOST_SAMPLES;
1929 
1930 	evlist__for_each_entry(session->evlist, evsel) {
1931 		struct xyarray *xy = evsel->core.sample_id;
1932 		u64 lost_count;
1933 
1934 		if (xy == NULL || evsel->core.fd == NULL)
1935 			continue;
1936 		if (xyarray__max_x(evsel->core.fd) != xyarray__max_x(xy) ||
1937 		    xyarray__max_y(evsel->core.fd) != xyarray__max_y(xy)) {
1938 			pr_debug("Unmatched FD vs. sample ID: skip reading LOST count\n");
1939 			continue;
1940 		}
1941 
1942 		for (int x = 0; x < xyarray__max_x(xy); x++) {
1943 			for (int y = 0; y < xyarray__max_y(xy); y++) {
1944 				struct perf_counts_values count;
1945 
1946 				if (perf_evsel__read(&evsel->core, x, y, &count) < 0) {
1947 					pr_debug("read LOST count failed\n");
1948 					goto out;
1949 				}
1950 
1951 				if (count.lost) {
1952 					__record__save_lost_samples(rec, evsel, lost,
1953 								    x, y, count.lost, 0);
1954 				}
1955 			}
1956 		}
1957 
1958 		lost_count = perf_bpf_filter__lost_count(evsel);
1959 		if (lost_count)
1960 			__record__save_lost_samples(rec, evsel, lost, 0, 0, lost_count,
1961 						    PERF_RECORD_MISC_LOST_SAMPLES_BPF);
1962 	}
1963 out:
1964 	free(lost);
1965 }
1966 
1967 static volatile sig_atomic_t workload_exec_errno;
1968 
1969 /*
1970  * evlist__prepare_workload will send a SIGUSR1
1971  * if the fork fails, since we asked by setting its
1972  * want_signal to true.
1973  */
1974 static void workload_exec_failed_signal(int signo __maybe_unused,
1975 					siginfo_t *info,
1976 					void *ucontext __maybe_unused)
1977 {
1978 	workload_exec_errno = info->si_value.sival_int;
1979 	done = 1;
1980 	child_finished = 1;
1981 }
1982 
1983 static void snapshot_sig_handler(int sig);
1984 static void alarm_sig_handler(int sig);
1985 
1986 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1987 {
1988 	if (evlist) {
1989 		if (evlist->mmap && evlist->mmap[0].core.base)
1990 			return evlist->mmap[0].core.base;
1991 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1992 			return evlist->overwrite_mmap[0].core.base;
1993 	}
1994 	return NULL;
1995 }
1996 
1997 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1998 {
1999 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
2000 	if (pc)
2001 		return pc;
2002 	return NULL;
2003 }
2004 
2005 static int record__synthesize(struct record *rec, bool tail)
2006 {
2007 	struct perf_session *session = rec->session;
2008 	struct machine *machine = &session->machines.host;
2009 	struct perf_data *data = &rec->data;
2010 	struct record_opts *opts = &rec->opts;
2011 	struct perf_tool *tool = &rec->tool;
2012 	int err = 0;
2013 	event_op f = process_synthesized_event;
2014 
2015 	if (rec->opts.tail_synthesize != tail)
2016 		return 0;
2017 
2018 	if (data->is_pipe) {
2019 		err = perf_event__synthesize_for_pipe(tool, session, data,
2020 						      process_synthesized_event);
2021 		if (err < 0)
2022 			goto out;
2023 
2024 		rec->bytes_written += err;
2025 	}
2026 
2027 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
2028 					  process_synthesized_event, machine);
2029 	if (err)
2030 		goto out;
2031 
2032 	/* Synthesize id_index before auxtrace_info */
2033 	err = perf_event__synthesize_id_index(tool,
2034 					      process_synthesized_event,
2035 					      session->evlist, machine);
2036 	if (err)
2037 		goto out;
2038 
2039 	if (rec->opts.full_auxtrace) {
2040 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
2041 					session, process_synthesized_event);
2042 		if (err)
2043 			goto out;
2044 	}
2045 
2046 	if (!evlist__exclude_kernel(rec->evlist)) {
2047 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
2048 							 machine);
2049 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
2050 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2051 				   "Check /proc/kallsyms permission or run as root.\n");
2052 
2053 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
2054 						     machine);
2055 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
2056 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
2057 				   "Check /proc/modules permission or run as root.\n");
2058 	}
2059 
2060 	if (perf_guest) {
2061 		machines__process_guests(&session->machines,
2062 					 perf_event__synthesize_guest_os, tool);
2063 	}
2064 
2065 	err = perf_event__synthesize_extra_attr(&rec->tool,
2066 						rec->evlist,
2067 						process_synthesized_event,
2068 						data->is_pipe);
2069 	if (err)
2070 		goto out;
2071 
2072 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
2073 						 process_synthesized_event,
2074 						NULL);
2075 	if (err < 0) {
2076 		pr_err("Couldn't synthesize thread map.\n");
2077 		return err;
2078 	}
2079 
2080 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.all_cpus,
2081 					     process_synthesized_event, NULL);
2082 	if (err < 0) {
2083 		pr_err("Couldn't synthesize cpu map.\n");
2084 		return err;
2085 	}
2086 
2087 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
2088 						machine, opts);
2089 	if (err < 0) {
2090 		pr_warning("Couldn't synthesize bpf events.\n");
2091 		err = 0;
2092 	}
2093 
2094 	if (rec->opts.synth & PERF_SYNTH_CGROUP) {
2095 		err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
2096 						     machine);
2097 		if (err < 0) {
2098 			pr_warning("Couldn't synthesize cgroup events.\n");
2099 			err = 0;
2100 		}
2101 	}
2102 
2103 	if (rec->opts.nr_threads_synthesize > 1) {
2104 		mutex_init(&synth_lock);
2105 		perf_set_multithreaded();
2106 		f = process_locked_synthesized_event;
2107 	}
2108 
2109 	if (rec->opts.synth & PERF_SYNTH_TASK) {
2110 		bool needs_mmap = rec->opts.synth & PERF_SYNTH_MMAP;
2111 
2112 		err = __machine__synthesize_threads(machine, tool, &opts->target,
2113 						    rec->evlist->core.threads,
2114 						    f, needs_mmap, opts->sample_address,
2115 						    rec->opts.nr_threads_synthesize);
2116 	}
2117 
2118 	if (rec->opts.nr_threads_synthesize > 1) {
2119 		perf_set_singlethreaded();
2120 		mutex_destroy(&synth_lock);
2121 	}
2122 
2123 out:
2124 	return err;
2125 }
2126 
2127 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
2128 {
2129 	struct record *rec = data;
2130 	pthread_kill(rec->thread_id, SIGUSR2);
2131 	return 0;
2132 }
2133 
2134 static int record__setup_sb_evlist(struct record *rec)
2135 {
2136 	struct record_opts *opts = &rec->opts;
2137 
2138 	if (rec->sb_evlist != NULL) {
2139 		/*
2140 		 * We get here if --switch-output-event populated the
2141 		 * sb_evlist, so associate a callback that will send a SIGUSR2
2142 		 * to the main thread.
2143 		 */
2144 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
2145 		rec->thread_id = pthread_self();
2146 	}
2147 #ifdef HAVE_LIBBPF_SUPPORT
2148 	if (!opts->no_bpf_event) {
2149 		if (rec->sb_evlist == NULL) {
2150 			rec->sb_evlist = evlist__new();
2151 
2152 			if (rec->sb_evlist == NULL) {
2153 				pr_err("Couldn't create side band evlist.\n.");
2154 				return -1;
2155 			}
2156 		}
2157 
2158 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
2159 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n.");
2160 			return -1;
2161 		}
2162 	}
2163 #endif
2164 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
2165 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
2166 		opts->no_bpf_event = true;
2167 	}
2168 
2169 	return 0;
2170 }
2171 
2172 static int record__init_clock(struct record *rec)
2173 {
2174 	struct perf_session *session = rec->session;
2175 	struct timespec ref_clockid;
2176 	struct timeval ref_tod;
2177 	u64 ref;
2178 
2179 	if (!rec->opts.use_clockid)
2180 		return 0;
2181 
2182 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
2183 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
2184 
2185 	session->header.env.clock.clockid = rec->opts.clockid;
2186 
2187 	if (gettimeofday(&ref_tod, NULL) != 0) {
2188 		pr_err("gettimeofday failed, cannot set reference time.\n");
2189 		return -1;
2190 	}
2191 
2192 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
2193 		pr_err("clock_gettime failed, cannot set reference time.\n");
2194 		return -1;
2195 	}
2196 
2197 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
2198 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
2199 
2200 	session->header.env.clock.tod_ns = ref;
2201 
2202 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
2203 	      (u64) ref_clockid.tv_nsec;
2204 
2205 	session->header.env.clock.clockid_ns = ref;
2206 	return 0;
2207 }
2208 
2209 static void hit_auxtrace_snapshot_trigger(struct record *rec)
2210 {
2211 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2212 		trigger_hit(&auxtrace_snapshot_trigger);
2213 		auxtrace_record__snapshot_started = 1;
2214 		if (auxtrace_record__snapshot_start(rec->itr))
2215 			trigger_error(&auxtrace_snapshot_trigger);
2216 	}
2217 }
2218 
2219 static void record__uniquify_name(struct record *rec)
2220 {
2221 	struct evsel *pos;
2222 	struct evlist *evlist = rec->evlist;
2223 	char *new_name;
2224 	int ret;
2225 
2226 	if (perf_pmus__num_core_pmus() == 1)
2227 		return;
2228 
2229 	evlist__for_each_entry(evlist, pos) {
2230 		if (!evsel__is_hybrid(pos))
2231 			continue;
2232 
2233 		if (strchr(pos->name, '/'))
2234 			continue;
2235 
2236 		ret = asprintf(&new_name, "%s/%s/",
2237 			       pos->pmu_name, pos->name);
2238 		if (ret) {
2239 			free(pos->name);
2240 			pos->name = new_name;
2241 		}
2242 	}
2243 }
2244 
2245 static int record__terminate_thread(struct record_thread *thread_data)
2246 {
2247 	int err;
2248 	enum thread_msg ack = THREAD_MSG__UNDEFINED;
2249 	pid_t tid = thread_data->tid;
2250 
2251 	close(thread_data->pipes.msg[1]);
2252 	thread_data->pipes.msg[1] = -1;
2253 	err = read(thread_data->pipes.ack[0], &ack, sizeof(ack));
2254 	if (err > 0)
2255 		pr_debug2("threads[%d]: sent %s\n", tid, thread_msg_tags[ack]);
2256 	else
2257 		pr_warning("threads[%d]: failed to receive termination notification from %d\n",
2258 			   thread->tid, tid);
2259 
2260 	return 0;
2261 }
2262 
2263 static int record__start_threads(struct record *rec)
2264 {
2265 	int t, tt, err, ret = 0, nr_threads = rec->nr_threads;
2266 	struct record_thread *thread_data = rec->thread_data;
2267 	sigset_t full, mask;
2268 	pthread_t handle;
2269 	pthread_attr_t attrs;
2270 
2271 	thread = &thread_data[0];
2272 
2273 	if (!record__threads_enabled(rec))
2274 		return 0;
2275 
2276 	sigfillset(&full);
2277 	if (sigprocmask(SIG_SETMASK, &full, &mask)) {
2278 		pr_err("Failed to block signals on threads start: %s\n", strerror(errno));
2279 		return -1;
2280 	}
2281 
2282 	pthread_attr_init(&attrs);
2283 	pthread_attr_setdetachstate(&attrs, PTHREAD_CREATE_DETACHED);
2284 
2285 	for (t = 1; t < nr_threads; t++) {
2286 		enum thread_msg msg = THREAD_MSG__UNDEFINED;
2287 
2288 #ifdef HAVE_PTHREAD_ATTR_SETAFFINITY_NP
2289 		pthread_attr_setaffinity_np(&attrs,
2290 					    MMAP_CPU_MASK_BYTES(&(thread_data[t].mask->affinity)),
2291 					    (cpu_set_t *)(thread_data[t].mask->affinity.bits));
2292 #endif
2293 		if (pthread_create(&handle, &attrs, record__thread, &thread_data[t])) {
2294 			for (tt = 1; tt < t; tt++)
2295 				record__terminate_thread(&thread_data[t]);
2296 			pr_err("Failed to start threads: %s\n", strerror(errno));
2297 			ret = -1;
2298 			goto out_err;
2299 		}
2300 
2301 		err = read(thread_data[t].pipes.ack[0], &msg, sizeof(msg));
2302 		if (err > 0)
2303 			pr_debug2("threads[%d]: sent %s\n", rec->thread_data[t].tid,
2304 				  thread_msg_tags[msg]);
2305 		else
2306 			pr_warning("threads[%d]: failed to receive start notification from %d\n",
2307 				   thread->tid, rec->thread_data[t].tid);
2308 	}
2309 
2310 	sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&thread->mask->affinity),
2311 			(cpu_set_t *)thread->mask->affinity.bits);
2312 
2313 	pr_debug("threads[%d]: started on cpu%d\n", thread->tid, sched_getcpu());
2314 
2315 out_err:
2316 	pthread_attr_destroy(&attrs);
2317 
2318 	if (sigprocmask(SIG_SETMASK, &mask, NULL)) {
2319 		pr_err("Failed to unblock signals on threads start: %s\n", strerror(errno));
2320 		ret = -1;
2321 	}
2322 
2323 	return ret;
2324 }
2325 
2326 static int record__stop_threads(struct record *rec)
2327 {
2328 	int t;
2329 	struct record_thread *thread_data = rec->thread_data;
2330 
2331 	for (t = 1; t < rec->nr_threads; t++)
2332 		record__terminate_thread(&thread_data[t]);
2333 
2334 	for (t = 0; t < rec->nr_threads; t++) {
2335 		rec->samples += thread_data[t].samples;
2336 		if (!record__threads_enabled(rec))
2337 			continue;
2338 		rec->session->bytes_transferred += thread_data[t].bytes_transferred;
2339 		rec->session->bytes_compressed += thread_data[t].bytes_compressed;
2340 		pr_debug("threads[%d]: samples=%lld, wakes=%ld, ", thread_data[t].tid,
2341 			 thread_data[t].samples, thread_data[t].waking);
2342 		if (thread_data[t].bytes_transferred && thread_data[t].bytes_compressed)
2343 			pr_debug("transferred=%" PRIu64 ", compressed=%" PRIu64 "\n",
2344 				 thread_data[t].bytes_transferred, thread_data[t].bytes_compressed);
2345 		else
2346 			pr_debug("written=%" PRIu64 "\n", thread_data[t].bytes_written);
2347 	}
2348 
2349 	return 0;
2350 }
2351 
2352 static unsigned long record__waking(struct record *rec)
2353 {
2354 	int t;
2355 	unsigned long waking = 0;
2356 	struct record_thread *thread_data = rec->thread_data;
2357 
2358 	for (t = 0; t < rec->nr_threads; t++)
2359 		waking += thread_data[t].waking;
2360 
2361 	return waking;
2362 }
2363 
2364 static int __cmd_record(struct record *rec, int argc, const char **argv)
2365 {
2366 	int err;
2367 	int status = 0;
2368 	const bool forks = argc > 0;
2369 	struct perf_tool *tool = &rec->tool;
2370 	struct record_opts *opts = &rec->opts;
2371 	struct perf_data *data = &rec->data;
2372 	struct perf_session *session;
2373 	bool disabled = false, draining = false;
2374 	int fd;
2375 	float ratio = 0;
2376 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
2377 
2378 	atexit(record__sig_exit);
2379 	signal(SIGCHLD, sig_handler);
2380 	signal(SIGINT, sig_handler);
2381 	signal(SIGTERM, sig_handler);
2382 	signal(SIGSEGV, sigsegv_handler);
2383 
2384 	if (rec->opts.record_namespaces)
2385 		tool->namespace_events = true;
2386 
2387 	if (rec->opts.record_cgroup) {
2388 #ifdef HAVE_FILE_HANDLE
2389 		tool->cgroup_events = true;
2390 #else
2391 		pr_err("cgroup tracking is not supported\n");
2392 		return -1;
2393 #endif
2394 	}
2395 
2396 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
2397 		signal(SIGUSR2, snapshot_sig_handler);
2398 		if (rec->opts.auxtrace_snapshot_mode)
2399 			trigger_on(&auxtrace_snapshot_trigger);
2400 		if (rec->switch_output.enabled)
2401 			trigger_on(&switch_output_trigger);
2402 	} else {
2403 		signal(SIGUSR2, SIG_IGN);
2404 	}
2405 
2406 	session = perf_session__new(data, tool);
2407 	if (IS_ERR(session)) {
2408 		pr_err("Perf session creation failed.\n");
2409 		return PTR_ERR(session);
2410 	}
2411 
2412 	if (record__threads_enabled(rec)) {
2413 		if (perf_data__is_pipe(&rec->data)) {
2414 			pr_err("Parallel trace streaming is not available in pipe mode.\n");
2415 			return -1;
2416 		}
2417 		if (rec->opts.full_auxtrace) {
2418 			pr_err("Parallel trace streaming is not available in AUX area tracing mode.\n");
2419 			return -1;
2420 		}
2421 	}
2422 
2423 	fd = perf_data__fd(data);
2424 	rec->session = session;
2425 
2426 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
2427 		pr_err("Compression initialization failed.\n");
2428 		return -1;
2429 	}
2430 #ifdef HAVE_EVENTFD_SUPPORT
2431 	done_fd = eventfd(0, EFD_NONBLOCK);
2432 	if (done_fd < 0) {
2433 		pr_err("Failed to create wakeup eventfd, error: %m\n");
2434 		status = -1;
2435 		goto out_delete_session;
2436 	}
2437 	err = evlist__add_wakeup_eventfd(rec->evlist, done_fd);
2438 	if (err < 0) {
2439 		pr_err("Failed to add wakeup eventfd to poll list\n");
2440 		status = err;
2441 		goto out_delete_session;
2442 	}
2443 #endif // HAVE_EVENTFD_SUPPORT
2444 
2445 	session->header.env.comp_type  = PERF_COMP_ZSTD;
2446 	session->header.env.comp_level = rec->opts.comp_level;
2447 
2448 	if (rec->opts.kcore &&
2449 	    !record__kcore_readable(&session->machines.host)) {
2450 		pr_err("ERROR: kcore is not readable.\n");
2451 		return -1;
2452 	}
2453 
2454 	if (record__init_clock(rec))
2455 		return -1;
2456 
2457 	record__init_features(rec);
2458 
2459 	if (forks) {
2460 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
2461 					       workload_exec_failed_signal);
2462 		if (err < 0) {
2463 			pr_err("Couldn't run the workload!\n");
2464 			status = err;
2465 			goto out_delete_session;
2466 		}
2467 	}
2468 
2469 	/*
2470 	 * If we have just single event and are sending data
2471 	 * through pipe, we need to force the ids allocation,
2472 	 * because we synthesize event name through the pipe
2473 	 * and need the id for that.
2474 	 */
2475 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
2476 		rec->opts.sample_id = true;
2477 
2478 	record__uniquify_name(rec);
2479 
2480 	/* Debug message used by test scripts */
2481 	pr_debug3("perf record opening and mmapping events\n");
2482 	if (record__open(rec) != 0) {
2483 		err = -1;
2484 		goto out_free_threads;
2485 	}
2486 	/* Debug message used by test scripts */
2487 	pr_debug3("perf record done opening and mmapping events\n");
2488 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
2489 
2490 	if (rec->opts.kcore) {
2491 		err = record__kcore_copy(&session->machines.host, data);
2492 		if (err) {
2493 			pr_err("ERROR: Failed to copy kcore\n");
2494 			goto out_free_threads;
2495 		}
2496 	}
2497 
2498 	/*
2499 	 * Normally perf_session__new would do this, but it doesn't have the
2500 	 * evlist.
2501 	 */
2502 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
2503 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
2504 		rec->tool.ordered_events = false;
2505 	}
2506 
2507 	if (evlist__nr_groups(rec->evlist) == 0)
2508 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
2509 
2510 	if (data->is_pipe) {
2511 		err = perf_header__write_pipe(fd);
2512 		if (err < 0)
2513 			goto out_free_threads;
2514 	} else {
2515 		err = perf_session__write_header(session, rec->evlist, fd, false);
2516 		if (err < 0)
2517 			goto out_free_threads;
2518 	}
2519 
2520 	err = -1;
2521 	if (!rec->no_buildid
2522 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
2523 		pr_err("Couldn't generate buildids. "
2524 		       "Use --no-buildid to profile anyway.\n");
2525 		goto out_free_threads;
2526 	}
2527 
2528 	err = record__setup_sb_evlist(rec);
2529 	if (err)
2530 		goto out_free_threads;
2531 
2532 	err = record__synthesize(rec, false);
2533 	if (err < 0)
2534 		goto out_free_threads;
2535 
2536 	if (rec->realtime_prio) {
2537 		struct sched_param param;
2538 
2539 		param.sched_priority = rec->realtime_prio;
2540 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
2541 			pr_err("Could not set realtime priority.\n");
2542 			err = -1;
2543 			goto out_free_threads;
2544 		}
2545 	}
2546 
2547 	if (record__start_threads(rec))
2548 		goto out_free_threads;
2549 
2550 	/*
2551 	 * When perf is starting the traced process, all the events
2552 	 * (apart from group members) have enable_on_exec=1 set,
2553 	 * so don't spoil it by prematurely enabling them.
2554 	 */
2555 	if (!target__none(&opts->target) && !opts->target.initial_delay)
2556 		evlist__enable(rec->evlist);
2557 
2558 	/*
2559 	 * Let the child rip
2560 	 */
2561 	if (forks) {
2562 		struct machine *machine = &session->machines.host;
2563 		union perf_event *event;
2564 		pid_t tgid;
2565 
2566 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
2567 		if (event == NULL) {
2568 			err = -ENOMEM;
2569 			goto out_child;
2570 		}
2571 
2572 		/*
2573 		 * Some H/W events are generated before COMM event
2574 		 * which is emitted during exec(), so perf script
2575 		 * cannot see a correct process name for those events.
2576 		 * Synthesize COMM event to prevent it.
2577 		 */
2578 		tgid = perf_event__synthesize_comm(tool, event,
2579 						   rec->evlist->workload.pid,
2580 						   process_synthesized_event,
2581 						   machine);
2582 		free(event);
2583 
2584 		if (tgid == -1)
2585 			goto out_child;
2586 
2587 		event = malloc(sizeof(event->namespaces) +
2588 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
2589 			       machine->id_hdr_size);
2590 		if (event == NULL) {
2591 			err = -ENOMEM;
2592 			goto out_child;
2593 		}
2594 
2595 		/*
2596 		 * Synthesize NAMESPACES event for the command specified.
2597 		 */
2598 		perf_event__synthesize_namespaces(tool, event,
2599 						  rec->evlist->workload.pid,
2600 						  tgid, process_synthesized_event,
2601 						  machine);
2602 		free(event);
2603 
2604 		evlist__start_workload(rec->evlist);
2605 	}
2606 
2607 	if (opts->target.initial_delay) {
2608 		pr_info(EVLIST_DISABLED_MSG);
2609 		if (opts->target.initial_delay > 0) {
2610 			usleep(opts->target.initial_delay * USEC_PER_MSEC);
2611 			evlist__enable(rec->evlist);
2612 			pr_info(EVLIST_ENABLED_MSG);
2613 		}
2614 	}
2615 
2616 	err = event_enable_timer__start(rec->evlist->eet);
2617 	if (err)
2618 		goto out_child;
2619 
2620 	/* Debug message used by test scripts */
2621 	pr_debug3("perf record has started\n");
2622 	fflush(stderr);
2623 
2624 	trigger_ready(&auxtrace_snapshot_trigger);
2625 	trigger_ready(&switch_output_trigger);
2626 	perf_hooks__invoke_record_start();
2627 
2628 	/*
2629 	 * Must write FINISHED_INIT so it will be seen after all other
2630 	 * synthesized user events, but before any regular events.
2631 	 */
2632 	err = write_finished_init(rec, false);
2633 	if (err < 0)
2634 		goto out_child;
2635 
2636 	for (;;) {
2637 		unsigned long long hits = thread->samples;
2638 
2639 		/*
2640 		 * rec->evlist->bkw_mmap_state is possible to be
2641 		 * BKW_MMAP_EMPTY here: when done == true and
2642 		 * hits != rec->samples in previous round.
2643 		 *
2644 		 * evlist__toggle_bkw_mmap ensure we never
2645 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
2646 		 */
2647 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
2648 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
2649 
2650 		if (record__mmap_read_all(rec, false) < 0) {
2651 			trigger_error(&auxtrace_snapshot_trigger);
2652 			trigger_error(&switch_output_trigger);
2653 			err = -1;
2654 			goto out_child;
2655 		}
2656 
2657 		if (auxtrace_record__snapshot_started) {
2658 			auxtrace_record__snapshot_started = 0;
2659 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
2660 				record__read_auxtrace_snapshot(rec, false);
2661 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
2662 				pr_err("AUX area tracing snapshot failed\n");
2663 				err = -1;
2664 				goto out_child;
2665 			}
2666 		}
2667 
2668 		if (trigger_is_hit(&switch_output_trigger)) {
2669 			/*
2670 			 * If switch_output_trigger is hit, the data in
2671 			 * overwritable ring buffer should have been collected,
2672 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
2673 			 *
2674 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
2675 			 * record__mmap_read_all() didn't collect data from
2676 			 * overwritable ring buffer. Read again.
2677 			 */
2678 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
2679 				continue;
2680 			trigger_ready(&switch_output_trigger);
2681 
2682 			/*
2683 			 * Reenable events in overwrite ring buffer after
2684 			 * record__mmap_read_all(): we should have collected
2685 			 * data from it.
2686 			 */
2687 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
2688 
2689 			if (!quiet)
2690 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
2691 					record__waking(rec));
2692 			thread->waking = 0;
2693 			fd = record__switch_output(rec, false);
2694 			if (fd < 0) {
2695 				pr_err("Failed to switch to new file\n");
2696 				trigger_error(&switch_output_trigger);
2697 				err = fd;
2698 				goto out_child;
2699 			}
2700 
2701 			/* re-arm the alarm */
2702 			if (rec->switch_output.time)
2703 				alarm(rec->switch_output.time);
2704 		}
2705 
2706 		if (hits == thread->samples) {
2707 			if (done || draining)
2708 				break;
2709 			err = fdarray__poll(&thread->pollfd, -1);
2710 			/*
2711 			 * Propagate error, only if there's any. Ignore positive
2712 			 * number of returned events and interrupt error.
2713 			 */
2714 			if (err > 0 || (err < 0 && errno == EINTR))
2715 				err = 0;
2716 			thread->waking++;
2717 
2718 			if (fdarray__filter(&thread->pollfd, POLLERR | POLLHUP,
2719 					    record__thread_munmap_filtered, NULL) == 0)
2720 				draining = true;
2721 
2722 			err = record__update_evlist_pollfd_from_thread(rec, rec->evlist, thread);
2723 			if (err)
2724 				goto out_child;
2725 		}
2726 
2727 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
2728 			switch (cmd) {
2729 			case EVLIST_CTL_CMD_SNAPSHOT:
2730 				hit_auxtrace_snapshot_trigger(rec);
2731 				evlist__ctlfd_ack(rec->evlist);
2732 				break;
2733 			case EVLIST_CTL_CMD_STOP:
2734 				done = 1;
2735 				break;
2736 			case EVLIST_CTL_CMD_ACK:
2737 			case EVLIST_CTL_CMD_UNSUPPORTED:
2738 			case EVLIST_CTL_CMD_ENABLE:
2739 			case EVLIST_CTL_CMD_DISABLE:
2740 			case EVLIST_CTL_CMD_EVLIST:
2741 			case EVLIST_CTL_CMD_PING:
2742 			default:
2743 				break;
2744 			}
2745 		}
2746 
2747 		err = event_enable_timer__process(rec->evlist->eet);
2748 		if (err < 0)
2749 			goto out_child;
2750 		if (err) {
2751 			err = 0;
2752 			done = 1;
2753 		}
2754 
2755 		/*
2756 		 * When perf is starting the traced process, at the end events
2757 		 * die with the process and we wait for that. Thus no need to
2758 		 * disable events in this case.
2759 		 */
2760 		if (done && !disabled && !target__none(&opts->target)) {
2761 			trigger_off(&auxtrace_snapshot_trigger);
2762 			evlist__disable(rec->evlist);
2763 			disabled = true;
2764 		}
2765 	}
2766 
2767 	trigger_off(&auxtrace_snapshot_trigger);
2768 	trigger_off(&switch_output_trigger);
2769 
2770 	if (opts->auxtrace_snapshot_on_exit)
2771 		record__auxtrace_snapshot_exit(rec);
2772 
2773 	if (forks && workload_exec_errno) {
2774 		char msg[STRERR_BUFSIZE], strevsels[2048];
2775 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
2776 
2777 		evlist__scnprintf_evsels(rec->evlist, sizeof(strevsels), strevsels);
2778 
2779 		pr_err("Failed to collect '%s' for the '%s' workload: %s\n",
2780 			strevsels, argv[0], emsg);
2781 		err = -1;
2782 		goto out_child;
2783 	}
2784 
2785 	if (!quiet)
2786 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n",
2787 			record__waking(rec));
2788 
2789 	write_finished_init(rec, true);
2790 
2791 	if (target__none(&rec->opts.target))
2792 		record__synthesize_workload(rec, true);
2793 
2794 out_child:
2795 	record__stop_threads(rec);
2796 	record__mmap_read_all(rec, true);
2797 out_free_threads:
2798 	record__free_thread_data(rec);
2799 	evlist__finalize_ctlfd(rec->evlist);
2800 	record__aio_mmap_read_sync(rec);
2801 
2802 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
2803 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
2804 		session->header.env.comp_ratio = ratio + 0.5;
2805 	}
2806 
2807 	if (forks) {
2808 		int exit_status;
2809 
2810 		if (!child_finished)
2811 			kill(rec->evlist->workload.pid, SIGTERM);
2812 
2813 		wait(&exit_status);
2814 
2815 		if (err < 0)
2816 			status = err;
2817 		else if (WIFEXITED(exit_status))
2818 			status = WEXITSTATUS(exit_status);
2819 		else if (WIFSIGNALED(exit_status))
2820 			signr = WTERMSIG(exit_status);
2821 	} else
2822 		status = err;
2823 
2824 	if (rec->off_cpu)
2825 		rec->bytes_written += off_cpu_write(rec->session);
2826 
2827 	record__read_lost_samples(rec);
2828 	record__synthesize(rec, true);
2829 	/* this will be recalculated during process_buildids() */
2830 	rec->samples = 0;
2831 
2832 	if (!err) {
2833 		if (!rec->timestamp_filename) {
2834 			record__finish_output(rec);
2835 		} else {
2836 			fd = record__switch_output(rec, true);
2837 			if (fd < 0) {
2838 				status = fd;
2839 				goto out_delete_session;
2840 			}
2841 		}
2842 	}
2843 
2844 	perf_hooks__invoke_record_end();
2845 
2846 	if (!err && !quiet) {
2847 		char samples[128];
2848 		const char *postfix = rec->timestamp_filename ?
2849 					".<timestamp>" : "";
2850 
2851 		if (rec->samples && !rec->opts.full_auxtrace)
2852 			scnprintf(samples, sizeof(samples),
2853 				  " (%" PRIu64 " samples)", rec->samples);
2854 		else
2855 			samples[0] = '\0';
2856 
2857 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2858 			perf_data__size(data) / 1024.0 / 1024.0,
2859 			data->path, postfix, samples);
2860 		if (ratio) {
2861 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2862 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2863 					ratio);
2864 		}
2865 		fprintf(stderr, " ]\n");
2866 	}
2867 
2868 out_delete_session:
2869 #ifdef HAVE_EVENTFD_SUPPORT
2870 	if (done_fd >= 0) {
2871 		fd = done_fd;
2872 		done_fd = -1;
2873 
2874 		close(fd);
2875 	}
2876 #endif
2877 	zstd_fini(&session->zstd_data);
2878 	perf_session__delete(session);
2879 
2880 	if (!opts->no_bpf_event)
2881 		evlist__stop_sb_thread(rec->sb_evlist);
2882 	return status;
2883 }
2884 
2885 static void callchain_debug(struct callchain_param *callchain)
2886 {
2887 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2888 
2889 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2890 
2891 	if (callchain->record_mode == CALLCHAIN_DWARF)
2892 		pr_debug("callchain: stack dump size %d\n",
2893 			 callchain->dump_size);
2894 }
2895 
2896 int record_opts__parse_callchain(struct record_opts *record,
2897 				 struct callchain_param *callchain,
2898 				 const char *arg, bool unset)
2899 {
2900 	int ret;
2901 	callchain->enabled = !unset;
2902 
2903 	/* --no-call-graph */
2904 	if (unset) {
2905 		callchain->record_mode = CALLCHAIN_NONE;
2906 		pr_debug("callchain: disabled\n");
2907 		return 0;
2908 	}
2909 
2910 	ret = parse_callchain_record_opt(arg, callchain);
2911 	if (!ret) {
2912 		/* Enable data address sampling for DWARF unwind. */
2913 		if (callchain->record_mode == CALLCHAIN_DWARF)
2914 			record->sample_address = true;
2915 		callchain_debug(callchain);
2916 	}
2917 
2918 	return ret;
2919 }
2920 
2921 int record_parse_callchain_opt(const struct option *opt,
2922 			       const char *arg,
2923 			       int unset)
2924 {
2925 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2926 }
2927 
2928 int record_callchain_opt(const struct option *opt,
2929 			 const char *arg __maybe_unused,
2930 			 int unset __maybe_unused)
2931 {
2932 	struct callchain_param *callchain = opt->value;
2933 
2934 	callchain->enabled = true;
2935 
2936 	if (callchain->record_mode == CALLCHAIN_NONE)
2937 		callchain->record_mode = CALLCHAIN_FP;
2938 
2939 	callchain_debug(callchain);
2940 	return 0;
2941 }
2942 
2943 static int perf_record_config(const char *var, const char *value, void *cb)
2944 {
2945 	struct record *rec = cb;
2946 
2947 	if (!strcmp(var, "record.build-id")) {
2948 		if (!strcmp(value, "cache"))
2949 			rec->no_buildid_cache = false;
2950 		else if (!strcmp(value, "no-cache"))
2951 			rec->no_buildid_cache = true;
2952 		else if (!strcmp(value, "skip"))
2953 			rec->no_buildid = true;
2954 		else if (!strcmp(value, "mmap"))
2955 			rec->buildid_mmap = true;
2956 		else
2957 			return -1;
2958 		return 0;
2959 	}
2960 	if (!strcmp(var, "record.call-graph")) {
2961 		var = "call-graph.record-mode";
2962 		return perf_default_config(var, value, cb);
2963 	}
2964 #ifdef HAVE_AIO_SUPPORT
2965 	if (!strcmp(var, "record.aio")) {
2966 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2967 		if (!rec->opts.nr_cblocks)
2968 			rec->opts.nr_cblocks = nr_cblocks_default;
2969 	}
2970 #endif
2971 	if (!strcmp(var, "record.debuginfod")) {
2972 		rec->debuginfod.urls = strdup(value);
2973 		if (!rec->debuginfod.urls)
2974 			return -ENOMEM;
2975 		rec->debuginfod.set = true;
2976 	}
2977 
2978 	return 0;
2979 }
2980 
2981 static int record__parse_event_enable_time(const struct option *opt, const char *str, int unset)
2982 {
2983 	struct record *rec = (struct record *)opt->value;
2984 
2985 	return evlist__parse_event_enable_time(rec->evlist, &rec->opts, str, unset);
2986 }
2987 
2988 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2989 {
2990 	struct record_opts *opts = (struct record_opts *)opt->value;
2991 
2992 	if (unset || !str)
2993 		return 0;
2994 
2995 	if (!strcasecmp(str, "node"))
2996 		opts->affinity = PERF_AFFINITY_NODE;
2997 	else if (!strcasecmp(str, "cpu"))
2998 		opts->affinity = PERF_AFFINITY_CPU;
2999 
3000 	return 0;
3001 }
3002 
3003 static int record__mmap_cpu_mask_alloc(struct mmap_cpu_mask *mask, int nr_bits)
3004 {
3005 	mask->nbits = nr_bits;
3006 	mask->bits = bitmap_zalloc(mask->nbits);
3007 	if (!mask->bits)
3008 		return -ENOMEM;
3009 
3010 	return 0;
3011 }
3012 
3013 static void record__mmap_cpu_mask_free(struct mmap_cpu_mask *mask)
3014 {
3015 	bitmap_free(mask->bits);
3016 	mask->nbits = 0;
3017 }
3018 
3019 static int record__thread_mask_alloc(struct thread_mask *mask, int nr_bits)
3020 {
3021 	int ret;
3022 
3023 	ret = record__mmap_cpu_mask_alloc(&mask->maps, nr_bits);
3024 	if (ret) {
3025 		mask->affinity.bits = NULL;
3026 		return ret;
3027 	}
3028 
3029 	ret = record__mmap_cpu_mask_alloc(&mask->affinity, nr_bits);
3030 	if (ret) {
3031 		record__mmap_cpu_mask_free(&mask->maps);
3032 		mask->maps.bits = NULL;
3033 	}
3034 
3035 	return ret;
3036 }
3037 
3038 static void record__thread_mask_free(struct thread_mask *mask)
3039 {
3040 	record__mmap_cpu_mask_free(&mask->maps);
3041 	record__mmap_cpu_mask_free(&mask->affinity);
3042 }
3043 
3044 static int record__parse_threads(const struct option *opt, const char *str, int unset)
3045 {
3046 	int s;
3047 	struct record_opts *opts = opt->value;
3048 
3049 	if (unset || !str || !strlen(str)) {
3050 		opts->threads_spec = THREAD_SPEC__CPU;
3051 	} else {
3052 		for (s = 1; s < THREAD_SPEC__MAX; s++) {
3053 			if (s == THREAD_SPEC__USER) {
3054 				opts->threads_user_spec = strdup(str);
3055 				if (!opts->threads_user_spec)
3056 					return -ENOMEM;
3057 				opts->threads_spec = THREAD_SPEC__USER;
3058 				break;
3059 			}
3060 			if (!strncasecmp(str, thread_spec_tags[s], strlen(thread_spec_tags[s]))) {
3061 				opts->threads_spec = s;
3062 				break;
3063 			}
3064 		}
3065 	}
3066 
3067 	if (opts->threads_spec == THREAD_SPEC__USER)
3068 		pr_debug("threads_spec: %s\n", opts->threads_user_spec);
3069 	else
3070 		pr_debug("threads_spec: %s\n", thread_spec_tags[opts->threads_spec]);
3071 
3072 	return 0;
3073 }
3074 
3075 static int parse_output_max_size(const struct option *opt,
3076 				 const char *str, int unset)
3077 {
3078 	unsigned long *s = (unsigned long *)opt->value;
3079 	static struct parse_tag tags_size[] = {
3080 		{ .tag  = 'B', .mult = 1       },
3081 		{ .tag  = 'K', .mult = 1 << 10 },
3082 		{ .tag  = 'M', .mult = 1 << 20 },
3083 		{ .tag  = 'G', .mult = 1 << 30 },
3084 		{ .tag  = 0 },
3085 	};
3086 	unsigned long val;
3087 
3088 	if (unset) {
3089 		*s = 0;
3090 		return 0;
3091 	}
3092 
3093 	val = parse_tag_value(str, tags_size);
3094 	if (val != (unsigned long) -1) {
3095 		*s = val;
3096 		return 0;
3097 	}
3098 
3099 	return -1;
3100 }
3101 
3102 static int record__parse_mmap_pages(const struct option *opt,
3103 				    const char *str,
3104 				    int unset __maybe_unused)
3105 {
3106 	struct record_opts *opts = opt->value;
3107 	char *s, *p;
3108 	unsigned int mmap_pages;
3109 	int ret;
3110 
3111 	if (!str)
3112 		return -EINVAL;
3113 
3114 	s = strdup(str);
3115 	if (!s)
3116 		return -ENOMEM;
3117 
3118 	p = strchr(s, ',');
3119 	if (p)
3120 		*p = '\0';
3121 
3122 	if (*s) {
3123 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
3124 		if (ret)
3125 			goto out_free;
3126 		opts->mmap_pages = mmap_pages;
3127 	}
3128 
3129 	if (!p) {
3130 		ret = 0;
3131 		goto out_free;
3132 	}
3133 
3134 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
3135 	if (ret)
3136 		goto out_free;
3137 
3138 	opts->auxtrace_mmap_pages = mmap_pages;
3139 
3140 out_free:
3141 	free(s);
3142 	return ret;
3143 }
3144 
3145 void __weak arch__add_leaf_frame_record_opts(struct record_opts *opts __maybe_unused)
3146 {
3147 }
3148 
3149 static int parse_control_option(const struct option *opt,
3150 				const char *str,
3151 				int unset __maybe_unused)
3152 {
3153 	struct record_opts *opts = opt->value;
3154 
3155 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
3156 }
3157 
3158 static void switch_output_size_warn(struct record *rec)
3159 {
3160 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
3161 	struct switch_output *s = &rec->switch_output;
3162 
3163 	wakeup_size /= 2;
3164 
3165 	if (s->size < wakeup_size) {
3166 		char buf[100];
3167 
3168 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
3169 		pr_warning("WARNING: switch-output data size lower than "
3170 			   "wakeup kernel buffer size (%s) "
3171 			   "expect bigger perf.data sizes\n", buf);
3172 	}
3173 }
3174 
3175 static int switch_output_setup(struct record *rec)
3176 {
3177 	struct switch_output *s = &rec->switch_output;
3178 	static struct parse_tag tags_size[] = {
3179 		{ .tag  = 'B', .mult = 1       },
3180 		{ .tag  = 'K', .mult = 1 << 10 },
3181 		{ .tag  = 'M', .mult = 1 << 20 },
3182 		{ .tag  = 'G', .mult = 1 << 30 },
3183 		{ .tag  = 0 },
3184 	};
3185 	static struct parse_tag tags_time[] = {
3186 		{ .tag  = 's', .mult = 1        },
3187 		{ .tag  = 'm', .mult = 60       },
3188 		{ .tag  = 'h', .mult = 60*60    },
3189 		{ .tag  = 'd', .mult = 60*60*24 },
3190 		{ .tag  = 0 },
3191 	};
3192 	unsigned long val;
3193 
3194 	/*
3195 	 * If we're using --switch-output-events, then we imply its
3196 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
3197 	 *  thread to its parent.
3198 	 */
3199 	if (rec->switch_output_event_set) {
3200 		if (record__threads_enabled(rec)) {
3201 			pr_warning("WARNING: --switch-output-event option is not available in parallel streaming mode.\n");
3202 			return 0;
3203 		}
3204 		goto do_signal;
3205 	}
3206 
3207 	if (!s->set)
3208 		return 0;
3209 
3210 	if (record__threads_enabled(rec)) {
3211 		pr_warning("WARNING: --switch-output option is not available in parallel streaming mode.\n");
3212 		return 0;
3213 	}
3214 
3215 	if (!strcmp(s->str, "signal")) {
3216 do_signal:
3217 		s->signal = true;
3218 		pr_debug("switch-output with SIGUSR2 signal\n");
3219 		goto enabled;
3220 	}
3221 
3222 	val = parse_tag_value(s->str, tags_size);
3223 	if (val != (unsigned long) -1) {
3224 		s->size = val;
3225 		pr_debug("switch-output with %s size threshold\n", s->str);
3226 		goto enabled;
3227 	}
3228 
3229 	val = parse_tag_value(s->str, tags_time);
3230 	if (val != (unsigned long) -1) {
3231 		s->time = val;
3232 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
3233 			 s->str, s->time);
3234 		goto enabled;
3235 	}
3236 
3237 	return -1;
3238 
3239 enabled:
3240 	rec->timestamp_filename = true;
3241 	s->enabled              = true;
3242 
3243 	if (s->size && !rec->opts.no_buffering)
3244 		switch_output_size_warn(rec);
3245 
3246 	return 0;
3247 }
3248 
3249 static const char * const __record_usage[] = {
3250 	"perf record [<options>] [<command>]",
3251 	"perf record [<options>] -- <command> [<options>]",
3252 	NULL
3253 };
3254 const char * const *record_usage = __record_usage;
3255 
3256 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
3257 				  struct perf_sample *sample, struct machine *machine)
3258 {
3259 	/*
3260 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3261 	 * no need to add them twice.
3262 	 */
3263 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3264 		return 0;
3265 	return perf_event__process_mmap(tool, event, sample, machine);
3266 }
3267 
3268 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
3269 				   struct perf_sample *sample, struct machine *machine)
3270 {
3271 	/*
3272 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps()
3273 	 * no need to add them twice.
3274 	 */
3275 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
3276 		return 0;
3277 
3278 	return perf_event__process_mmap2(tool, event, sample, machine);
3279 }
3280 
3281 static int process_timestamp_boundary(struct perf_tool *tool,
3282 				      union perf_event *event __maybe_unused,
3283 				      struct perf_sample *sample,
3284 				      struct machine *machine __maybe_unused)
3285 {
3286 	struct record *rec = container_of(tool, struct record, tool);
3287 
3288 	set_timestamp_boundary(rec, sample->time);
3289 	return 0;
3290 }
3291 
3292 static int parse_record_synth_option(const struct option *opt,
3293 				     const char *str,
3294 				     int unset __maybe_unused)
3295 {
3296 	struct record_opts *opts = opt->value;
3297 	char *p = strdup(str);
3298 
3299 	if (p == NULL)
3300 		return -1;
3301 
3302 	opts->synth = parse_synth_opt(p);
3303 	free(p);
3304 
3305 	if (opts->synth < 0) {
3306 		pr_err("Invalid synth option: %s\n", str);
3307 		return -1;
3308 	}
3309 	return 0;
3310 }
3311 
3312 /*
3313  * XXX Ideally would be local to cmd_record() and passed to a record__new
3314  * because we need to have access to it in record__exit, that is called
3315  * after cmd_record() exits, but since record_options need to be accessible to
3316  * builtin-script, leave it here.
3317  *
3318  * At least we don't ouch it in all the other functions here directly.
3319  *
3320  * Just say no to tons of global variables, sigh.
3321  */
3322 static struct record record = {
3323 	.opts = {
3324 		.sample_time	     = true,
3325 		.mmap_pages	     = UINT_MAX,
3326 		.user_freq	     = UINT_MAX,
3327 		.user_interval	     = ULLONG_MAX,
3328 		.freq		     = 4000,
3329 		.target		     = {
3330 			.uses_mmap   = true,
3331 			.default_per_cpu = true,
3332 		},
3333 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
3334 		.nr_threads_synthesize = 1,
3335 		.ctl_fd              = -1,
3336 		.ctl_fd_ack          = -1,
3337 		.synth               = PERF_SYNTH_ALL,
3338 	},
3339 	.tool = {
3340 		.sample		= process_sample_event,
3341 		.fork		= perf_event__process_fork,
3342 		.exit		= perf_event__process_exit,
3343 		.comm		= perf_event__process_comm,
3344 		.namespaces	= perf_event__process_namespaces,
3345 		.mmap		= build_id__process_mmap,
3346 		.mmap2		= build_id__process_mmap2,
3347 		.itrace_start	= process_timestamp_boundary,
3348 		.aux		= process_timestamp_boundary,
3349 		.ordered_events	= true,
3350 	},
3351 };
3352 
3353 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
3354 	"\n\t\t\t\tDefault: fp";
3355 
3356 static bool dry_run;
3357 
3358 static struct parse_events_option_args parse_events_option_args = {
3359 	.evlistp = &record.evlist,
3360 };
3361 
3362 static struct parse_events_option_args switch_output_parse_events_option_args = {
3363 	.evlistp = &record.sb_evlist,
3364 };
3365 
3366 /*
3367  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
3368  * with it and switch to use the library functions in perf_evlist that came
3369  * from builtin-record.c, i.e. use record_opts,
3370  * evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
3371  * using pipes, etc.
3372  */
3373 static struct option __record_options[] = {
3374 	OPT_CALLBACK('e', "event", &parse_events_option_args, "event",
3375 		     "event selector. use 'perf list' to list available events",
3376 		     parse_events_option),
3377 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
3378 		     "event filter", parse_filter),
3379 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
3380 			   NULL, "don't record events from perf itself",
3381 			   exclude_perf),
3382 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
3383 		    "record events on existing process id"),
3384 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
3385 		    "record events on existing thread id"),
3386 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
3387 		    "collect data with this RT SCHED_FIFO priority"),
3388 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
3389 		    "collect data without buffering"),
3390 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
3391 		    "collect raw sample records from all opened counters"),
3392 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
3393 			    "system-wide collection from all CPUs"),
3394 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
3395 		    "list of cpus to monitor"),
3396 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
3397 	OPT_STRING('o', "output", &record.data.path, "file",
3398 		    "output file name"),
3399 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
3400 			&record.opts.no_inherit_set,
3401 			"child tasks do not inherit counters"),
3402 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
3403 		    "synthesize non-sample events at the end of output"),
3404 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
3405 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
3406 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
3407 		    "Fail if the specified frequency can't be used"),
3408 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
3409 		     "profile at this frequency",
3410 		      record__parse_freq),
3411 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
3412 		     "number of mmap data pages and AUX area tracing mmap pages",
3413 		     record__parse_mmap_pages),
3414 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
3415 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
3416 		     record__mmap_flush_parse),
3417 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
3418 			   NULL, "enables call-graph recording" ,
3419 			   &record_callchain_opt),
3420 	OPT_CALLBACK(0, "call-graph", &record.opts,
3421 		     "record_mode[,record_size]", record_callchain_help,
3422 		     &record_parse_callchain_opt),
3423 	OPT_INCR('v', "verbose", &verbose,
3424 		    "be more verbose (show counter open errors, etc)"),
3425 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any warnings or messages"),
3426 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
3427 		    "per thread counts"),
3428 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
3429 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
3430 		    "Record the sample physical addresses"),
3431 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
3432 		    "Record the sampled data address data page size"),
3433 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
3434 		    "Record the sampled code address (ip) page size"),
3435 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
3436 	OPT_BOOLEAN(0, "sample-identifier", &record.opts.sample_identifier,
3437 		    "Record the sample identifier"),
3438 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
3439 			&record.opts.sample_time_set,
3440 			"Record the sample timestamps"),
3441 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
3442 			"Record the sample period"),
3443 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
3444 		    "don't sample"),
3445 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
3446 			&record.no_buildid_cache_set,
3447 			"do not update the buildid cache"),
3448 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
3449 			&record.no_buildid_set,
3450 			"do not collect buildids in perf.data"),
3451 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
3452 		     "monitor event in cgroup name only",
3453 		     parse_cgroups),
3454 	OPT_CALLBACK('D', "delay", &record, "ms",
3455 		     "ms to wait before starting measurement after program start (-1: start with events disabled), "
3456 		     "or ranges of time to enable events e.g. '-D 10-20,30-40'",
3457 		     record__parse_event_enable_time),
3458 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
3459 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
3460 		   "user to profile"),
3461 
3462 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
3463 		     "branch any", "sample any taken branches",
3464 		     parse_branch_stack),
3465 
3466 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
3467 		     "branch filter mask", "branch stack filter modes",
3468 		     parse_branch_stack),
3469 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
3470 		    "sample by weight (on special events only)"),
3471 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
3472 		    "sample transaction flags (special events only)"),
3473 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
3474 		    "use per-thread mmaps"),
3475 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
3476 		    "sample selected machine registers on interrupt,"
3477 		    " use '-I?' to list register names", parse_intr_regs),
3478 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
3479 		    "sample selected machine registers on interrupt,"
3480 		    " use '--user-regs=?' to list register names", parse_user_regs),
3481 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
3482 		    "Record running/enabled time of read (:S) events"),
3483 	OPT_CALLBACK('k', "clockid", &record.opts,
3484 	"clockid", "clockid to use for events, see clock_gettime()",
3485 	parse_clockid),
3486 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
3487 			  "opts", "AUX area tracing Snapshot Mode", ""),
3488 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
3489 			  "opts", "sample AUX area", ""),
3490 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
3491 			"per thread proc mmap processing timeout in ms"),
3492 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
3493 		    "Record namespaces events"),
3494 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
3495 		    "Record cgroup events"),
3496 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
3497 			&record.opts.record_switch_events_set,
3498 			"Record context switch events"),
3499 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
3500 			 "Configure all used events to run in kernel space.",
3501 			 PARSE_OPT_EXCLUSIVE),
3502 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
3503 			 "Configure all used events to run in user space.",
3504 			 PARSE_OPT_EXCLUSIVE),
3505 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
3506 		    "collect kernel callchains"),
3507 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
3508 		    "collect user callchains"),
3509 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
3510 		   "file", "vmlinux pathname"),
3511 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
3512 		    "Record build-id of all DSOs regardless of hits"),
3513 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
3514 		    "Record build-id in map events"),
3515 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
3516 		    "append timestamp to output filename"),
3517 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
3518 		    "Record timestamp boundary (time of first/last samples)"),
3519 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
3520 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
3521 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
3522 			  "signal"),
3523 	OPT_CALLBACK_SET(0, "switch-output-event", &switch_output_parse_events_option_args,
3524 			 &record.switch_output_event_set, "switch output event",
3525 			 "switch output event selector. use 'perf list' to list available events",
3526 			 parse_events_option_new_evlist),
3527 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
3528 		   "Limit number of switch output generated files"),
3529 	OPT_BOOLEAN(0, "dry-run", &dry_run,
3530 		    "Parse options then exit"),
3531 #ifdef HAVE_AIO_SUPPORT
3532 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
3533 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
3534 		     record__aio_parse),
3535 #endif
3536 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
3537 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
3538 		     record__parse_affinity),
3539 #ifdef HAVE_ZSTD_SUPPORT
3540 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default, "n",
3541 			    "Compress records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
3542 			    record__parse_comp_level),
3543 #endif
3544 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
3545 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
3546 	OPT_UINTEGER(0, "num-thread-synthesize",
3547 		     &record.opts.nr_threads_synthesize,
3548 		     "number of threads to run for event synthesis"),
3549 #ifdef HAVE_LIBPFM
3550 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
3551 		"libpfm4 event selector. use 'perf list' to list available events",
3552 		parse_libpfm_events_option),
3553 #endif
3554 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
3555 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
3556 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
3557 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
3558 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
3559 		      parse_control_option),
3560 	OPT_CALLBACK(0, "synth", &record.opts, "no|all|task|mmap|cgroup",
3561 		     "Fine-tune event synthesis: default=all", parse_record_synth_option),
3562 	OPT_STRING_OPTARG_SET(0, "debuginfod", &record.debuginfod.urls,
3563 			  &record.debuginfod.set, "debuginfod urls",
3564 			  "Enable debuginfod data retrieval from DEBUGINFOD_URLS or specified urls",
3565 			  "system"),
3566 	OPT_CALLBACK_OPTARG(0, "threads", &record.opts, NULL, "spec",
3567 			    "write collected trace data into several data files using parallel threads",
3568 			    record__parse_threads),
3569 	OPT_BOOLEAN(0, "off-cpu", &record.off_cpu, "Enable off-cpu analysis"),
3570 	OPT_END()
3571 };
3572 
3573 struct option *record_options = __record_options;
3574 
3575 static int record__mmap_cpu_mask_init(struct mmap_cpu_mask *mask, struct perf_cpu_map *cpus)
3576 {
3577 	struct perf_cpu cpu;
3578 	int idx;
3579 
3580 	if (cpu_map__is_dummy(cpus))
3581 		return 0;
3582 
3583 	perf_cpu_map__for_each_cpu(cpu, idx, cpus) {
3584 		if (cpu.cpu == -1)
3585 			continue;
3586 		/* Return ENODEV is input cpu is greater than max cpu */
3587 		if ((unsigned long)cpu.cpu > mask->nbits)
3588 			return -ENODEV;
3589 		__set_bit(cpu.cpu, mask->bits);
3590 	}
3591 
3592 	return 0;
3593 }
3594 
3595 static int record__mmap_cpu_mask_init_spec(struct mmap_cpu_mask *mask, const char *mask_spec)
3596 {
3597 	struct perf_cpu_map *cpus;
3598 
3599 	cpus = perf_cpu_map__new(mask_spec);
3600 	if (!cpus)
3601 		return -ENOMEM;
3602 
3603 	bitmap_zero(mask->bits, mask->nbits);
3604 	if (record__mmap_cpu_mask_init(mask, cpus))
3605 		return -ENODEV;
3606 
3607 	perf_cpu_map__put(cpus);
3608 
3609 	return 0;
3610 }
3611 
3612 static void record__free_thread_masks(struct record *rec, int nr_threads)
3613 {
3614 	int t;
3615 
3616 	if (rec->thread_masks)
3617 		for (t = 0; t < nr_threads; t++)
3618 			record__thread_mask_free(&rec->thread_masks[t]);
3619 
3620 	zfree(&rec->thread_masks);
3621 }
3622 
3623 static int record__alloc_thread_masks(struct record *rec, int nr_threads, int nr_bits)
3624 {
3625 	int t, ret;
3626 
3627 	rec->thread_masks = zalloc(nr_threads * sizeof(*(rec->thread_masks)));
3628 	if (!rec->thread_masks) {
3629 		pr_err("Failed to allocate thread masks\n");
3630 		return -ENOMEM;
3631 	}
3632 
3633 	for (t = 0; t < nr_threads; t++) {
3634 		ret = record__thread_mask_alloc(&rec->thread_masks[t], nr_bits);
3635 		if (ret) {
3636 			pr_err("Failed to allocate thread masks[%d]\n", t);
3637 			goto out_free;
3638 		}
3639 	}
3640 
3641 	return 0;
3642 
3643 out_free:
3644 	record__free_thread_masks(rec, nr_threads);
3645 
3646 	return ret;
3647 }
3648 
3649 static int record__init_thread_cpu_masks(struct record *rec, struct perf_cpu_map *cpus)
3650 {
3651 	int t, ret, nr_cpus = perf_cpu_map__nr(cpus);
3652 
3653 	ret = record__alloc_thread_masks(rec, nr_cpus, cpu__max_cpu().cpu);
3654 	if (ret)
3655 		return ret;
3656 
3657 	rec->nr_threads = nr_cpus;
3658 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3659 
3660 	for (t = 0; t < rec->nr_threads; t++) {
3661 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].maps.bits);
3662 		__set_bit(perf_cpu_map__cpu(cpus, t).cpu, rec->thread_masks[t].affinity.bits);
3663 		if (verbose > 0) {
3664 			pr_debug("thread_masks[%d]: ", t);
3665 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3666 			pr_debug("thread_masks[%d]: ", t);
3667 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3668 		}
3669 	}
3670 
3671 	return 0;
3672 }
3673 
3674 static int record__init_thread_masks_spec(struct record *rec, struct perf_cpu_map *cpus,
3675 					  const char **maps_spec, const char **affinity_spec,
3676 					  u32 nr_spec)
3677 {
3678 	u32 s;
3679 	int ret = 0, t = 0;
3680 	struct mmap_cpu_mask cpus_mask;
3681 	struct thread_mask thread_mask, full_mask, *thread_masks;
3682 
3683 	ret = record__mmap_cpu_mask_alloc(&cpus_mask, cpu__max_cpu().cpu);
3684 	if (ret) {
3685 		pr_err("Failed to allocate CPUs mask\n");
3686 		return ret;
3687 	}
3688 
3689 	ret = record__mmap_cpu_mask_init(&cpus_mask, cpus);
3690 	if (ret) {
3691 		pr_err("Failed to init cpu mask\n");
3692 		goto out_free_cpu_mask;
3693 	}
3694 
3695 	ret = record__thread_mask_alloc(&full_mask, cpu__max_cpu().cpu);
3696 	if (ret) {
3697 		pr_err("Failed to allocate full mask\n");
3698 		goto out_free_cpu_mask;
3699 	}
3700 
3701 	ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3702 	if (ret) {
3703 		pr_err("Failed to allocate thread mask\n");
3704 		goto out_free_full_and_cpu_masks;
3705 	}
3706 
3707 	for (s = 0; s < nr_spec; s++) {
3708 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.maps, maps_spec[s]);
3709 		if (ret) {
3710 			pr_err("Failed to initialize maps thread mask\n");
3711 			goto out_free;
3712 		}
3713 		ret = record__mmap_cpu_mask_init_spec(&thread_mask.affinity, affinity_spec[s]);
3714 		if (ret) {
3715 			pr_err("Failed to initialize affinity thread mask\n");
3716 			goto out_free;
3717 		}
3718 
3719 		/* ignore invalid CPUs but do not allow empty masks */
3720 		if (!bitmap_and(thread_mask.maps.bits, thread_mask.maps.bits,
3721 				cpus_mask.bits, thread_mask.maps.nbits)) {
3722 			pr_err("Empty maps mask: %s\n", maps_spec[s]);
3723 			ret = -EINVAL;
3724 			goto out_free;
3725 		}
3726 		if (!bitmap_and(thread_mask.affinity.bits, thread_mask.affinity.bits,
3727 				cpus_mask.bits, thread_mask.affinity.nbits)) {
3728 			pr_err("Empty affinity mask: %s\n", affinity_spec[s]);
3729 			ret = -EINVAL;
3730 			goto out_free;
3731 		}
3732 
3733 		/* do not allow intersection with other masks (full_mask) */
3734 		if (bitmap_intersects(thread_mask.maps.bits, full_mask.maps.bits,
3735 				      thread_mask.maps.nbits)) {
3736 			pr_err("Intersecting maps mask: %s\n", maps_spec[s]);
3737 			ret = -EINVAL;
3738 			goto out_free;
3739 		}
3740 		if (bitmap_intersects(thread_mask.affinity.bits, full_mask.affinity.bits,
3741 				      thread_mask.affinity.nbits)) {
3742 			pr_err("Intersecting affinity mask: %s\n", affinity_spec[s]);
3743 			ret = -EINVAL;
3744 			goto out_free;
3745 		}
3746 
3747 		bitmap_or(full_mask.maps.bits, full_mask.maps.bits,
3748 			  thread_mask.maps.bits, full_mask.maps.nbits);
3749 		bitmap_or(full_mask.affinity.bits, full_mask.affinity.bits,
3750 			  thread_mask.affinity.bits, full_mask.maps.nbits);
3751 
3752 		thread_masks = realloc(rec->thread_masks, (t + 1) * sizeof(struct thread_mask));
3753 		if (!thread_masks) {
3754 			pr_err("Failed to reallocate thread masks\n");
3755 			ret = -ENOMEM;
3756 			goto out_free;
3757 		}
3758 		rec->thread_masks = thread_masks;
3759 		rec->thread_masks[t] = thread_mask;
3760 		if (verbose > 0) {
3761 			pr_debug("thread_masks[%d]: ", t);
3762 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].maps, "maps");
3763 			pr_debug("thread_masks[%d]: ", t);
3764 			mmap_cpu_mask__scnprintf(&rec->thread_masks[t].affinity, "affinity");
3765 		}
3766 		t++;
3767 		ret = record__thread_mask_alloc(&thread_mask, cpu__max_cpu().cpu);
3768 		if (ret) {
3769 			pr_err("Failed to allocate thread mask\n");
3770 			goto out_free_full_and_cpu_masks;
3771 		}
3772 	}
3773 	rec->nr_threads = t;
3774 	pr_debug("nr_threads: %d\n", rec->nr_threads);
3775 	if (!rec->nr_threads)
3776 		ret = -EINVAL;
3777 
3778 out_free:
3779 	record__thread_mask_free(&thread_mask);
3780 out_free_full_and_cpu_masks:
3781 	record__thread_mask_free(&full_mask);
3782 out_free_cpu_mask:
3783 	record__mmap_cpu_mask_free(&cpus_mask);
3784 
3785 	return ret;
3786 }
3787 
3788 static int record__init_thread_core_masks(struct record *rec, struct perf_cpu_map *cpus)
3789 {
3790 	int ret;
3791 	struct cpu_topology *topo;
3792 
3793 	topo = cpu_topology__new();
3794 	if (!topo) {
3795 		pr_err("Failed to allocate CPU topology\n");
3796 		return -ENOMEM;
3797 	}
3798 
3799 	ret = record__init_thread_masks_spec(rec, cpus, topo->core_cpus_list,
3800 					     topo->core_cpus_list, topo->core_cpus_lists);
3801 	cpu_topology__delete(topo);
3802 
3803 	return ret;
3804 }
3805 
3806 static int record__init_thread_package_masks(struct record *rec, struct perf_cpu_map *cpus)
3807 {
3808 	int ret;
3809 	struct cpu_topology *topo;
3810 
3811 	topo = cpu_topology__new();
3812 	if (!topo) {
3813 		pr_err("Failed to allocate CPU topology\n");
3814 		return -ENOMEM;
3815 	}
3816 
3817 	ret = record__init_thread_masks_spec(rec, cpus, topo->package_cpus_list,
3818 					     topo->package_cpus_list, topo->package_cpus_lists);
3819 	cpu_topology__delete(topo);
3820 
3821 	return ret;
3822 }
3823 
3824 static int record__init_thread_numa_masks(struct record *rec, struct perf_cpu_map *cpus)
3825 {
3826 	u32 s;
3827 	int ret;
3828 	const char **spec;
3829 	struct numa_topology *topo;
3830 
3831 	topo = numa_topology__new();
3832 	if (!topo) {
3833 		pr_err("Failed to allocate NUMA topology\n");
3834 		return -ENOMEM;
3835 	}
3836 
3837 	spec = zalloc(topo->nr * sizeof(char *));
3838 	if (!spec) {
3839 		pr_err("Failed to allocate NUMA spec\n");
3840 		ret = -ENOMEM;
3841 		goto out_delete_topo;
3842 	}
3843 	for (s = 0; s < topo->nr; s++)
3844 		spec[s] = topo->nodes[s].cpus;
3845 
3846 	ret = record__init_thread_masks_spec(rec, cpus, spec, spec, topo->nr);
3847 
3848 	zfree(&spec);
3849 
3850 out_delete_topo:
3851 	numa_topology__delete(topo);
3852 
3853 	return ret;
3854 }
3855 
3856 static int record__init_thread_user_masks(struct record *rec, struct perf_cpu_map *cpus)
3857 {
3858 	int t, ret;
3859 	u32 s, nr_spec = 0;
3860 	char **maps_spec = NULL, **affinity_spec = NULL, **tmp_spec;
3861 	char *user_spec, *spec, *spec_ptr, *mask, *mask_ptr, *dup_mask = NULL;
3862 
3863 	for (t = 0, user_spec = (char *)rec->opts.threads_user_spec; ; t++, user_spec = NULL) {
3864 		spec = strtok_r(user_spec, ":", &spec_ptr);
3865 		if (spec == NULL)
3866 			break;
3867 		pr_debug2("threads_spec[%d]: %s\n", t, spec);
3868 		mask = strtok_r(spec, "/", &mask_ptr);
3869 		if (mask == NULL)
3870 			break;
3871 		pr_debug2("  maps mask: %s\n", mask);
3872 		tmp_spec = realloc(maps_spec, (nr_spec + 1) * sizeof(char *));
3873 		if (!tmp_spec) {
3874 			pr_err("Failed to reallocate maps spec\n");
3875 			ret = -ENOMEM;
3876 			goto out_free;
3877 		}
3878 		maps_spec = tmp_spec;
3879 		maps_spec[nr_spec] = dup_mask = strdup(mask);
3880 		if (!maps_spec[nr_spec]) {
3881 			pr_err("Failed to allocate maps spec[%d]\n", nr_spec);
3882 			ret = -ENOMEM;
3883 			goto out_free;
3884 		}
3885 		mask = strtok_r(NULL, "/", &mask_ptr);
3886 		if (mask == NULL) {
3887 			pr_err("Invalid thread maps or affinity specs\n");
3888 			ret = -EINVAL;
3889 			goto out_free;
3890 		}
3891 		pr_debug2("  affinity mask: %s\n", mask);
3892 		tmp_spec = realloc(affinity_spec, (nr_spec + 1) * sizeof(char *));
3893 		if (!tmp_spec) {
3894 			pr_err("Failed to reallocate affinity spec\n");
3895 			ret = -ENOMEM;
3896 			goto out_free;
3897 		}
3898 		affinity_spec = tmp_spec;
3899 		affinity_spec[nr_spec] = strdup(mask);
3900 		if (!affinity_spec[nr_spec]) {
3901 			pr_err("Failed to allocate affinity spec[%d]\n", nr_spec);
3902 			ret = -ENOMEM;
3903 			goto out_free;
3904 		}
3905 		dup_mask = NULL;
3906 		nr_spec++;
3907 	}
3908 
3909 	ret = record__init_thread_masks_spec(rec, cpus, (const char **)maps_spec,
3910 					     (const char **)affinity_spec, nr_spec);
3911 
3912 out_free:
3913 	free(dup_mask);
3914 	for (s = 0; s < nr_spec; s++) {
3915 		if (maps_spec)
3916 			free(maps_spec[s]);
3917 		if (affinity_spec)
3918 			free(affinity_spec[s]);
3919 	}
3920 	free(affinity_spec);
3921 	free(maps_spec);
3922 
3923 	return ret;
3924 }
3925 
3926 static int record__init_thread_default_masks(struct record *rec, struct perf_cpu_map *cpus)
3927 {
3928 	int ret;
3929 
3930 	ret = record__alloc_thread_masks(rec, 1, cpu__max_cpu().cpu);
3931 	if (ret)
3932 		return ret;
3933 
3934 	if (record__mmap_cpu_mask_init(&rec->thread_masks->maps, cpus))
3935 		return -ENODEV;
3936 
3937 	rec->nr_threads = 1;
3938 
3939 	return 0;
3940 }
3941 
3942 static int record__init_thread_masks(struct record *rec)
3943 {
3944 	int ret = 0;
3945 	struct perf_cpu_map *cpus = rec->evlist->core.all_cpus;
3946 
3947 	if (!record__threads_enabled(rec))
3948 		return record__init_thread_default_masks(rec, cpus);
3949 
3950 	if (evlist__per_thread(rec->evlist)) {
3951 		pr_err("--per-thread option is mutually exclusive to parallel streaming mode.\n");
3952 		return -EINVAL;
3953 	}
3954 
3955 	switch (rec->opts.threads_spec) {
3956 	case THREAD_SPEC__CPU:
3957 		ret = record__init_thread_cpu_masks(rec, cpus);
3958 		break;
3959 	case THREAD_SPEC__CORE:
3960 		ret = record__init_thread_core_masks(rec, cpus);
3961 		break;
3962 	case THREAD_SPEC__PACKAGE:
3963 		ret = record__init_thread_package_masks(rec, cpus);
3964 		break;
3965 	case THREAD_SPEC__NUMA:
3966 		ret = record__init_thread_numa_masks(rec, cpus);
3967 		break;
3968 	case THREAD_SPEC__USER:
3969 		ret = record__init_thread_user_masks(rec, cpus);
3970 		break;
3971 	default:
3972 		break;
3973 	}
3974 
3975 	return ret;
3976 }
3977 
3978 int cmd_record(int argc, const char **argv)
3979 {
3980 	int err;
3981 	struct record *rec = &record;
3982 	char errbuf[BUFSIZ];
3983 
3984 	setlocale(LC_ALL, "");
3985 
3986 #ifndef HAVE_BPF_SKEL
3987 # define set_nobuild(s, l, m, c) set_option_nobuild(record_options, s, l, m, c)
3988 	set_nobuild('\0', "off-cpu", "no BUILD_BPF_SKEL=1", true);
3989 # undef set_nobuild
3990 #endif
3991 
3992 	rec->opts.affinity = PERF_AFFINITY_SYS;
3993 
3994 	rec->evlist = evlist__new();
3995 	if (rec->evlist == NULL)
3996 		return -ENOMEM;
3997 
3998 	err = perf_config(perf_record_config, rec);
3999 	if (err)
4000 		return err;
4001 
4002 	argc = parse_options(argc, argv, record_options, record_usage,
4003 			    PARSE_OPT_STOP_AT_NON_OPTION);
4004 	if (quiet)
4005 		perf_quiet_option();
4006 
4007 	err = symbol__validate_sym_arguments();
4008 	if (err)
4009 		return err;
4010 
4011 	perf_debuginfod_setup(&record.debuginfod);
4012 
4013 	/* Make system wide (-a) the default target. */
4014 	if (!argc && target__none(&rec->opts.target))
4015 		rec->opts.target.system_wide = true;
4016 
4017 	if (nr_cgroups && !rec->opts.target.system_wide) {
4018 		usage_with_options_msg(record_usage, record_options,
4019 			"cgroup monitoring only available in system-wide mode");
4020 
4021 	}
4022 
4023 	if (rec->buildid_mmap) {
4024 		if (!perf_can_record_build_id()) {
4025 			pr_err("Failed: no support to record build id in mmap events, update your kernel.\n");
4026 			err = -EINVAL;
4027 			goto out_opts;
4028 		}
4029 		pr_debug("Enabling build id in mmap2 events.\n");
4030 		/* Enable mmap build id synthesizing. */
4031 		symbol_conf.buildid_mmap2 = true;
4032 		/* Enable perf_event_attr::build_id bit. */
4033 		rec->opts.build_id = true;
4034 		/* Disable build id cache. */
4035 		rec->no_buildid = true;
4036 	}
4037 
4038 	if (rec->opts.record_cgroup && !perf_can_record_cgroup()) {
4039 		pr_err("Kernel has no cgroup sampling support.\n");
4040 		err = -EINVAL;
4041 		goto out_opts;
4042 	}
4043 
4044 	if (rec->opts.kcore)
4045 		rec->opts.text_poke = true;
4046 
4047 	if (rec->opts.kcore || record__threads_enabled(rec))
4048 		rec->data.is_dir = true;
4049 
4050 	if (record__threads_enabled(rec)) {
4051 		if (rec->opts.affinity != PERF_AFFINITY_SYS) {
4052 			pr_err("--affinity option is mutually exclusive to parallel streaming mode.\n");
4053 			goto out_opts;
4054 		}
4055 		if (record__aio_enabled(rec)) {
4056 			pr_err("Asynchronous streaming mode (--aio) is mutually exclusive to parallel streaming mode.\n");
4057 			goto out_opts;
4058 		}
4059 	}
4060 
4061 	if (rec->opts.comp_level != 0) {
4062 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
4063 		rec->no_buildid = true;
4064 	}
4065 
4066 	if (rec->opts.record_switch_events &&
4067 	    !perf_can_record_switch_events()) {
4068 		ui__error("kernel does not support recording context switch events\n");
4069 		parse_options_usage(record_usage, record_options, "switch-events", 0);
4070 		err = -EINVAL;
4071 		goto out_opts;
4072 	}
4073 
4074 	if (switch_output_setup(rec)) {
4075 		parse_options_usage(record_usage, record_options, "switch-output", 0);
4076 		err = -EINVAL;
4077 		goto out_opts;
4078 	}
4079 
4080 	if (rec->switch_output.time) {
4081 		signal(SIGALRM, alarm_sig_handler);
4082 		alarm(rec->switch_output.time);
4083 	}
4084 
4085 	if (rec->switch_output.num_files) {
4086 		rec->switch_output.filenames = calloc(sizeof(char *),
4087 						      rec->switch_output.num_files);
4088 		if (!rec->switch_output.filenames) {
4089 			err = -EINVAL;
4090 			goto out_opts;
4091 		}
4092 	}
4093 
4094 	if (rec->timestamp_filename && record__threads_enabled(rec)) {
4095 		rec->timestamp_filename = false;
4096 		pr_warning("WARNING: --timestamp-filename option is not available in parallel streaming mode.\n");
4097 	}
4098 
4099 	/*
4100 	 * Allow aliases to facilitate the lookup of symbols for address
4101 	 * filters. Refer to auxtrace_parse_filters().
4102 	 */
4103 	symbol_conf.allow_aliases = true;
4104 
4105 	symbol__init(NULL);
4106 
4107 	err = record__auxtrace_init(rec);
4108 	if (err)
4109 		goto out;
4110 
4111 	if (dry_run)
4112 		goto out;
4113 
4114 	err = -ENOMEM;
4115 
4116 	if (rec->no_buildid_cache || rec->no_buildid) {
4117 		disable_buildid_cache();
4118 	} else if (rec->switch_output.enabled) {
4119 		/*
4120 		 * In 'perf record --switch-output', disable buildid
4121 		 * generation by default to reduce data file switching
4122 		 * overhead. Still generate buildid if they are required
4123 		 * explicitly using
4124 		 *
4125 		 *  perf record --switch-output --no-no-buildid \
4126 		 *              --no-no-buildid-cache
4127 		 *
4128 		 * Following code equals to:
4129 		 *
4130 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
4131 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
4132 		 *         disable_buildid_cache();
4133 		 */
4134 		bool disable = true;
4135 
4136 		if (rec->no_buildid_set && !rec->no_buildid)
4137 			disable = false;
4138 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
4139 			disable = false;
4140 		if (disable) {
4141 			rec->no_buildid = true;
4142 			rec->no_buildid_cache = true;
4143 			disable_buildid_cache();
4144 		}
4145 	}
4146 
4147 	if (record.opts.overwrite)
4148 		record.opts.tail_synthesize = true;
4149 
4150 	if (rec->evlist->core.nr_entries == 0) {
4151 		bool can_profile_kernel = perf_event_paranoid_check(1);
4152 
4153 		err = parse_event(rec->evlist, can_profile_kernel ? "cycles:P" : "cycles:Pu");
4154 		if (err)
4155 			goto out;
4156 	}
4157 
4158 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
4159 		rec->opts.no_inherit = true;
4160 
4161 	err = target__validate(&rec->opts.target);
4162 	if (err) {
4163 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4164 		ui__warning("%s\n", errbuf);
4165 	}
4166 
4167 	err = target__parse_uid(&rec->opts.target);
4168 	if (err) {
4169 		int saved_errno = errno;
4170 
4171 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
4172 		ui__error("%s", errbuf);
4173 
4174 		err = -saved_errno;
4175 		goto out;
4176 	}
4177 
4178 	/* Enable ignoring missing threads when -u/-p option is defined. */
4179 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
4180 
4181 	evlist__warn_user_requested_cpus(rec->evlist, rec->opts.target.cpu_list);
4182 
4183 	if (callchain_param.enabled && callchain_param.record_mode == CALLCHAIN_FP)
4184 		arch__add_leaf_frame_record_opts(&rec->opts);
4185 
4186 	err = -ENOMEM;
4187 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0) {
4188 		if (rec->opts.target.pid != NULL) {
4189 			pr_err("Couldn't create thread/CPU maps: %s\n",
4190 				errno == ENOENT ? "No such process" : str_error_r(errno, errbuf, sizeof(errbuf)));
4191 			goto out;
4192 		}
4193 		else
4194 			usage_with_options(record_usage, record_options);
4195 	}
4196 
4197 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
4198 	if (err)
4199 		goto out;
4200 
4201 	/*
4202 	 * We take all buildids when the file contains
4203 	 * AUX area tracing data because we do not decode the
4204 	 * trace because it would take too long.
4205 	 */
4206 	if (rec->opts.full_auxtrace)
4207 		rec->buildid_all = true;
4208 
4209 	if (rec->opts.text_poke) {
4210 		err = record__config_text_poke(rec->evlist);
4211 		if (err) {
4212 			pr_err("record__config_text_poke failed, error %d\n", err);
4213 			goto out;
4214 		}
4215 	}
4216 
4217 	if (rec->off_cpu) {
4218 		err = record__config_off_cpu(rec);
4219 		if (err) {
4220 			pr_err("record__config_off_cpu failed, error %d\n", err);
4221 			goto out;
4222 		}
4223 	}
4224 
4225 	if (record_opts__config(&rec->opts)) {
4226 		err = -EINVAL;
4227 		goto out;
4228 	}
4229 
4230 	err = record__config_tracking_events(rec);
4231 	if (err) {
4232 		pr_err("record__config_tracking_events failed, error %d\n", err);
4233 		goto out;
4234 	}
4235 
4236 	err = record__init_thread_masks(rec);
4237 	if (err) {
4238 		pr_err("Failed to initialize parallel data streaming masks\n");
4239 		goto out;
4240 	}
4241 
4242 	if (rec->opts.nr_cblocks > nr_cblocks_max)
4243 		rec->opts.nr_cblocks = nr_cblocks_max;
4244 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
4245 
4246 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
4247 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
4248 
4249 	if (rec->opts.comp_level > comp_level_max)
4250 		rec->opts.comp_level = comp_level_max;
4251 	pr_debug("comp level: %d\n", rec->opts.comp_level);
4252 
4253 	err = __cmd_record(&record, argc, argv);
4254 out:
4255 	evlist__delete(rec->evlist);
4256 	symbol__exit();
4257 	auxtrace_record__free(rec->itr);
4258 out_opts:
4259 	record__free_thread_masks(rec, rec->nr_threads);
4260 	rec->nr_threads = 0;
4261 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
4262 	return err;
4263 }
4264 
4265 static void snapshot_sig_handler(int sig __maybe_unused)
4266 {
4267 	struct record *rec = &record;
4268 
4269 	hit_auxtrace_snapshot_trigger(rec);
4270 
4271 	if (switch_output_signal(rec))
4272 		trigger_hit(&switch_output_trigger);
4273 }
4274 
4275 static void alarm_sig_handler(int sig __maybe_unused)
4276 {
4277 	struct record *rec = &record;
4278 
4279 	if (switch_output_time(rec))
4280 		trigger_hit(&switch_output_trigger);
4281 }
4282