xref: /linux/tools/perf/builtin-record.c (revision 37b9c7bbe1ee1937a317f7fafacd1d116202b2d8)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/perf_api_probe.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/cpu-set-sched.h"
43 #include "util/synthetic-events.h"
44 #include "util/time-utils.h"
45 #include "util/units.h"
46 #include "util/bpf-event.h"
47 #include "util/util.h"
48 #include "util/pfm.h"
49 #include "util/clockid.h"
50 #include "asm/bug.h"
51 #include "perf.h"
52 
53 #include <errno.h>
54 #include <inttypes.h>
55 #include <locale.h>
56 #include <poll.h>
57 #include <pthread.h>
58 #include <unistd.h>
59 #include <sched.h>
60 #include <signal.h>
61 #ifdef HAVE_EVENTFD_SUPPORT
62 #include <sys/eventfd.h>
63 #endif
64 #include <sys/mman.h>
65 #include <sys/wait.h>
66 #include <sys/types.h>
67 #include <sys/stat.h>
68 #include <fcntl.h>
69 #include <linux/err.h>
70 #include <linux/string.h>
71 #include <linux/time64.h>
72 #include <linux/zalloc.h>
73 #include <linux/bitmap.h>
74 #include <sys/time.h>
75 
76 struct switch_output {
77 	bool		 enabled;
78 	bool		 signal;
79 	unsigned long	 size;
80 	unsigned long	 time;
81 	const char	*str;
82 	bool		 set;
83 	char		 **filenames;
84 	int		 num_files;
85 	int		 cur_file;
86 };
87 
88 struct record {
89 	struct perf_tool	tool;
90 	struct record_opts	opts;
91 	u64			bytes_written;
92 	struct perf_data	data;
93 	struct auxtrace_record	*itr;
94 	struct evlist	*evlist;
95 	struct perf_session	*session;
96 	struct evlist		*sb_evlist;
97 	pthread_t		thread_id;
98 	int			realtime_prio;
99 	bool			switch_output_event_set;
100 	bool			no_buildid;
101 	bool			no_buildid_set;
102 	bool			no_buildid_cache;
103 	bool			no_buildid_cache_set;
104 	bool			buildid_all;
105 	bool			buildid_mmap;
106 	bool			timestamp_filename;
107 	bool			timestamp_boundary;
108 	struct switch_output	switch_output;
109 	unsigned long long	samples;
110 	struct mmap_cpu_mask	affinity_mask;
111 	unsigned long		output_max_size;	/* = 0: unlimited */
112 };
113 
114 static volatile int done;
115 
116 static volatile int auxtrace_record__snapshot_started;
117 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
118 static DEFINE_TRIGGER(switch_output_trigger);
119 
120 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
121 	"SYS", "NODE", "CPU"
122 };
123 
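/*
 * Predicates that decide whether the output file should be rotated: a
 * switch happens only when the switch-output trigger is armed and the
 * requested signal/size/time condition has been met.
 */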
124 static bool switch_output_signal(struct record *rec)
125 {
126 	return rec->switch_output.signal &&
127 	       trigger_is_ready(&switch_output_trigger);
128 }
129 
130 static bool switch_output_size(struct record *rec)
131 {
132 	return rec->switch_output.size &&
133 	       trigger_is_ready(&switch_output_trigger) &&
134 	       (rec->bytes_written >= rec->switch_output.size);
135 }
136 
137 static bool switch_output_time(struct record *rec)
138 {
139 	return rec->switch_output.time &&
140 	       trigger_is_ready(&switch_output_trigger);
141 }
142 
143 static bool record__output_max_size_exceeded(struct record *rec)
144 {
145 	return rec->output_max_size &&
146 	       (rec->bytes_written >= rec->output_max_size);
147 }
148 
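/*
 * Write a block of profiling data to the perf.data file and account it in
 * rec->bytes_written. Once the output size limit (rec->output_max_size) is
 * exceeded the session is stopped, and when the switch-output size
 * threshold is crossed the switch-output trigger is fired.
 */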
149 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
150 			 void *bf, size_t size)
151 {
152 	struct perf_data_file *file = &rec->session->data->file;
153 
154 	if (perf_data_file__write(file, bf, size) < 0) {
155 		pr_err("failed to write perf data, error: %m\n");
156 		return -1;
157 	}
158 
159 	rec->bytes_written += size;
160 
161 	if (record__output_max_size_exceeded(rec) && !done) {
162 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
163 				" stopping session ]\n",
164 				rec->bytes_written >> 10);
165 		done = 1;
166 	}
167 
168 	if (switch_output_size(rec))
169 		trigger_hit(&switch_output_trigger);
170 
171 	return 0;
172 }
173 
174 static int record__aio_enabled(struct record *rec);
175 static int record__comp_enabled(struct record *rec);
176 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
177 			    void *src, size_t src_size);
178 
179 #ifdef HAVE_AIO_SUPPORT
180 static int record__aio_write(struct aiocb *cblock, int trace_fd,
181 		void *buf, size_t size, off_t off)
182 {
183 	int rc;
184 
185 	cblock->aio_fildes = trace_fd;
186 	cblock->aio_buf    = buf;
187 	cblock->aio_nbytes = size;
188 	cblock->aio_offset = off;
189 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
190 
191 	do {
192 		rc = aio_write(cblock);
193 		if (rc == 0) {
194 			break;
195 		} else if (errno != EAGAIN) {
196 			cblock->aio_fildes = -1;
197 			pr_err("failed to queue perf data, error: %m\n");
198 			break;
199 		}
200 	} while (1);
201 
202 	return rc;
203 }
204 
205 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
206 {
207 	void *rem_buf;
208 	off_t rem_off;
209 	size_t rem_size;
210 	int rc, aio_errno;
211 	ssize_t aio_ret, written;
212 
213 	aio_errno = aio_error(cblock);
214 	if (aio_errno == EINPROGRESS)
215 		return 0;
216 
217 	written = aio_ret = aio_return(cblock);
218 	if (aio_ret < 0) {
219 		if (aio_errno != EINTR)
220 			pr_err("failed to write perf data, error: %m\n");
221 		written = 0;
222 	}
223 
224 	rem_size = cblock->aio_nbytes - written;
225 
226 	if (rem_size == 0) {
227 		cblock->aio_fildes = -1;
228 		/*
229 		 * md->refcount is incremented in record__aio_pushfn() for
230 		 * every aio write request started in record__aio_push() so
231 		 * decrement it because the request is now complete.
232 		 */
233 		perf_mmap__put(&md->core);
234 		rc = 1;
235 	} else {
236 		/*
237 		 * aio write request may require restart with the
238 		 * reminder if the kernel didn't write whole
239 		 * chunk at once.
240 		 */
241 		rem_off = cblock->aio_offset + written;
242 		rem_buf = (void *)(cblock->aio_buf + written);
243 		record__aio_write(cblock, cblock->aio_fildes,
244 				rem_buf, rem_size, rem_off);
245 		rc = 0;
246 	}
247 
248 	return rc;
249 }
250 
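/*
 * Reap completed aio write requests for this mmap. With sync_all == false,
 * return the index of the first free control block, suspending until one
 * becomes available; with sync_all == true, wait until every in-flight
 * request has completed and return -1.
 */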
251 static int record__aio_sync(struct mmap *md, bool sync_all)
252 {
253 	struct aiocb **aiocb = md->aio.aiocb;
254 	struct aiocb *cblocks = md->aio.cblocks;
255 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
256 	int i, do_suspend;
257 
258 	do {
259 		do_suspend = 0;
260 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
261 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
262 				if (sync_all)
263 					aiocb[i] = NULL;
264 				else
265 					return i;
266 			} else {
267 				/*
268 				 * The started aio write is not complete yet,
269 				 * so it has to be waited for before the
270 				 * next allocation.
271 				 */
272 				aiocb[i] = &cblocks[i];
273 				do_suspend = 1;
274 			}
275 		}
276 		if (!do_suspend)
277 			return -1;
278 
279 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
280 			if (!(errno == EAGAIN || errno == EINTR))
281 				pr_err("failed to sync perf data, error: %m\n");
282 		}
283 	} while (1);
284 }
285 
286 struct record_aio {
287 	struct record	*rec;
288 	void		*data;
289 	size_t		size;
290 };
291 
292 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
293 {
294 	struct record_aio *aio = to;
295 
296 	/*
297 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
298 	 * buffer to release space in the kernel buffer as fast as possible, by calling
299 	 * perf_mmap__consume() from the perf_mmap__push() function.
300 	 *
301 	 * That lets the kernel proceed with storing more profiling data into
302 	 * the kernel buffer earlier than other per-CPU kernel buffers are handled.
303 	 *
304 	 * Copying can be done in two steps in case the chunk of profiling data
305 	 * crosses the upper bound of the kernel buffer. In this case we first move
306 	 * the part of the data from map->start till the upper bound and then the
307 	 * remainder from the beginning of the kernel buffer till the end of the chunk.
308 	 */
309 
310 	if (record__comp_enabled(aio->rec)) {
311 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
312 				     mmap__mmap_len(map) - aio->size,
313 				     buf, size);
314 	} else {
315 		memcpy(aio->data + aio->size, buf, size);
316 	}
317 
318 	if (!aio->size) {
319 		/*
320 		 * Increment map->refcount to guard the map->aio.data[] buffer
321 		 * from premature deallocation, because the map object can be
322 		 * released earlier than the aio write request started on the
323 		 * map->aio.data[] buffer completes.
324 		 *
325 		 * perf_mmap__put() is done in record__aio_complete() after the
326 		 * started aio request completes, or in record__aio_push()
327 		 * if the request failed to start.
328 		 */
329 		perf_mmap__get(&map->core);
330 	}
331 
332 	aio->size += size;
333 
334 	return size;
335 }
336 
337 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
338 {
339 	int ret, idx;
340 	int trace_fd = rec->session->data->file.fd;
341 	struct record_aio aio = { .rec = rec, .size = 0 };
342 
343 	/*
344 	 * Call record__aio_sync() to wait till map->aio.data[] buffer
345 	 * becomes available after previous aio write operation.
346 	 */
347 
348 	idx = record__aio_sync(map, false);
349 	aio.data = map->aio.data[idx];
350 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
351 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
352 		return ret;
353 
354 	rec->samples++;
355 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
356 	if (!ret) {
357 		*off += aio.size;
358 		rec->bytes_written += aio.size;
359 		if (switch_output_size(rec))
360 			trigger_hit(&switch_output_trigger);
361 	} else {
362 		/*
363 		 * Decrement the map->refcount incremented in record__aio_pushfn()
364 		 * if the record__aio_write() operation failed to start; otherwise
365 		 * map->refcount is decremented in record__aio_complete() after the
366 		 * aio write operation finishes successfully.
367 		 */
368 		perf_mmap__put(&map->core);
369 	}
370 
371 	return ret;
372 }
373 
374 static off_t record__aio_get_pos(int trace_fd)
375 {
376 	return lseek(trace_fd, 0, SEEK_CUR);
377 }
378 
379 static void record__aio_set_pos(int trace_fd, off_t pos)
380 {
381 	lseek(trace_fd, pos, SEEK_SET);
382 }
383 
384 static void record__aio_mmap_read_sync(struct record *rec)
385 {
386 	int i;
387 	struct evlist *evlist = rec->evlist;
388 	struct mmap *maps = evlist->mmap;
389 
390 	if (!record__aio_enabled(rec))
391 		return;
392 
393 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
394 		struct mmap *map = &maps[i];
395 
396 		if (map->core.base)
397 			record__aio_sync(map, true);
398 	}
399 }
400 
401 static int nr_cblocks_default = 1;
402 static int nr_cblocks_max = 4;
403 
404 static int record__aio_parse(const struct option *opt,
405 			     const char *str,
406 			     int unset)
407 {
408 	struct record_opts *opts = (struct record_opts *)opt->value;
409 
410 	if (unset) {
411 		opts->nr_cblocks = 0;
412 	} else {
413 		if (str)
414 			opts->nr_cblocks = strtol(str, NULL, 0);
415 		if (!opts->nr_cblocks)
416 			opts->nr_cblocks = nr_cblocks_default;
417 	}
418 
419 	return 0;
420 }
421 #else /* HAVE_AIO_SUPPORT */
422 static int nr_cblocks_max = 0;
423 
424 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
425 			    off_t *off __maybe_unused)
426 {
427 	return -1;
428 }
429 
430 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
431 {
432 	return -1;
433 }
434 
435 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
436 {
437 }
438 
439 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
440 {
441 }
442 #endif
443 
444 static int record__aio_enabled(struct record *rec)
445 {
446 	return rec->opts.nr_cblocks > 0;
447 }
448 
449 #define MMAP_FLUSH_DEFAULT 1
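/*
 * Parse the minimal amount of data that must accumulate in a mmaped ring
 * buffer before it is flushed to the output (--mmap-flush). Accepts
 * B/K/M/G suffixes, defaults to 1 byte and is capped at a quarter of the
 * mmap buffer size.
 */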
450 static int record__mmap_flush_parse(const struct option *opt,
451 				    const char *str,
452 				    int unset)
453 {
454 	int flush_max;
455 	struct record_opts *opts = (struct record_opts *)opt->value;
456 	static struct parse_tag tags[] = {
457 			{ .tag  = 'B', .mult = 1       },
458 			{ .tag  = 'K', .mult = 1 << 10 },
459 			{ .tag  = 'M', .mult = 1 << 20 },
460 			{ .tag  = 'G', .mult = 1 << 30 },
461 			{ .tag  = 0 },
462 	};
463 
464 	if (unset)
465 		return 0;
466 
467 	if (str) {
468 		opts->mmap_flush = parse_tag_value(str, tags);
469 		if (opts->mmap_flush == (int)-1)
470 			opts->mmap_flush = strtol(str, NULL, 0);
471 	}
472 
473 	if (!opts->mmap_flush)
474 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
475 
476 	flush_max = evlist__mmap_size(opts->mmap_pages);
477 	flush_max /= 4;
478 	if (opts->mmap_flush > flush_max)
479 		opts->mmap_flush = flush_max;
480 
481 	return 0;
482 }
483 
484 #ifdef HAVE_ZSTD_SUPPORT
485 static unsigned int comp_level_default = 1;
486 
487 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
488 {
489 	struct record_opts *opts = opt->value;
490 
491 	if (unset) {
492 		opts->comp_level = 0;
493 	} else {
494 		if (str)
495 			opts->comp_level = strtol(str, NULL, 0);
496 		if (!opts->comp_level)
497 			opts->comp_level = comp_level_default;
498 	}
499 
500 	return 0;
501 }
502 #endif
503 static unsigned int comp_level_max = 22;
504 
505 static int record__comp_enabled(struct record *rec)
506 {
507 	return rec->opts.comp_level > 0;
508 }
509 
510 static int process_synthesized_event(struct perf_tool *tool,
511 				     union perf_event *event,
512 				     struct perf_sample *sample __maybe_unused,
513 				     struct machine *machine __maybe_unused)
514 {
515 	struct record *rec = container_of(tool, struct record, tool);
516 	return record__write(rec, NULL, event, event->header.size);
517 }
518 
519 static int process_locked_synthesized_event(struct perf_tool *tool,
520 				     union perf_event *event,
521 				     struct perf_sample *sample __maybe_unused,
522 				     struct machine *machine __maybe_unused)
523 {
524 	static pthread_mutex_t synth_lock = PTHREAD_MUTEX_INITIALIZER;
525 	int ret;
526 
527 	pthread_mutex_lock(&synth_lock);
528 	ret = process_synthesized_event(tool, event, sample, machine);
529 	pthread_mutex_unlock(&synth_lock);
530 	return ret;
531 }
532 
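/*
 * perf_mmap__push() callback for the serial (non-AIO) path: optionally
 * zstd-compress the chunk into map->data, then write it out via
 * record__write().
 */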
533 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
534 {
535 	struct record *rec = to;
536 
537 	if (record__comp_enabled(rec)) {
538 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
539 		bf   = map->data;
540 	}
541 
542 	rec->samples++;
543 	return record__write(rec, map, bf, size);
544 }
545 
546 static volatile int signr = -1;
547 static volatile int child_finished;
548 #ifdef HAVE_EVENTFD_SUPPORT
549 static int done_fd = -1;
550 #endif
551 
552 static void sig_handler(int sig)
553 {
554 	if (sig == SIGCHLD)
555 		child_finished = 1;
556 	else
557 		signr = sig;
558 
559 	done = 1;
560 #ifdef HAVE_EVENTFD_SUPPORT
561 {
562 	u64 tmp = 1;
563 	/*
564 	 * It is possible for this signal handler to run after done is checked
565 	 * in the main loop, but before the perf counter fds are polled. If this
566 	 * happens, the poll() will continue to wait even though done is set,
567 	 * and will only break out if either another signal is received, or the
568 	 * counters are ready for read. To ensure the poll() doesn't sleep when
569 	 * done is set, use an eventfd (done_fd) to wake up the poll().
570 	 */
571 	if (write(done_fd, &tmp, sizeof(tmp)) < 0)
572 		pr_err("failed to signal wakeup fd, error: %m\n");
573 }
574 #endif // HAVE_EVENTFD_SUPPORT
575 }
576 
577 static void sigsegv_handler(int sig)
578 {
579 	perf_hooks__recover();
580 	sighandler_dump_stack(sig);
581 }
582 
583 static void record__sig_exit(void)
584 {
585 	if (signr == -1)
586 		return;
587 
588 	signal(signr, SIG_DFL);
589 	raise(signr);
590 }
591 
592 #ifdef HAVE_AUXTRACE_SUPPORT
593 
594 static int record__process_auxtrace(struct perf_tool *tool,
595 				    struct mmap *map,
596 				    union perf_event *event, void *data1,
597 				    size_t len1, void *data2, size_t len2)
598 {
599 	struct record *rec = container_of(tool, struct record, tool);
600 	struct perf_data *data = &rec->data;
601 	size_t padding;
602 	u8 pad[8] = {0};
603 
604 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
605 		off_t file_offset;
606 		int fd = perf_data__fd(data);
607 		int err;
608 
609 		file_offset = lseek(fd, 0, SEEK_CUR);
610 		if (file_offset == -1)
611 			return -1;
612 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
613 						     event, file_offset);
614 		if (err)
615 			return err;
616 	}
617 
618 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
619 	padding = (len1 + len2) & 7;
620 	if (padding)
621 		padding = 8 - padding;
622 
623 	record__write(rec, map, event, event->header.size);
624 	record__write(rec, map, data1, len1);
625 	if (len2)
626 		record__write(rec, map, data2, len2);
627 	record__write(rec, map, &pad, padding);
628 
629 	return 0;
630 }
631 
632 static int record__auxtrace_mmap_read(struct record *rec,
633 				      struct mmap *map)
634 {
635 	int ret;
636 
637 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
638 				  record__process_auxtrace);
639 	if (ret < 0)
640 		return ret;
641 
642 	if (ret)
643 		rec->samples++;
644 
645 	return 0;
646 }
647 
648 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
649 					       struct mmap *map)
650 {
651 	int ret;
652 
653 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
654 					   record__process_auxtrace,
655 					   rec->opts.auxtrace_snapshot_size);
656 	if (ret < 0)
657 		return ret;
658 
659 	if (ret)
660 		rec->samples++;
661 
662 	return 0;
663 }
664 
665 static int record__auxtrace_read_snapshot_all(struct record *rec)
666 {
667 	int i;
668 	int rc = 0;
669 
670 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
671 		struct mmap *map = &rec->evlist->mmap[i];
672 
673 		if (!map->auxtrace_mmap.base)
674 			continue;
675 
676 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
677 			rc = -1;
678 			goto out;
679 		}
680 	}
681 out:
682 	return rc;
683 }
684 
685 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
686 {
687 	pr_debug("Recording AUX area tracing snapshot\n");
688 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
689 		trigger_error(&auxtrace_snapshot_trigger);
690 	} else {
691 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
692 			trigger_error(&auxtrace_snapshot_trigger);
693 		else
694 			trigger_ready(&auxtrace_snapshot_trigger);
695 	}
696 }
697 
698 static int record__auxtrace_snapshot_exit(struct record *rec)
699 {
700 	if (trigger_is_error(&auxtrace_snapshot_trigger))
701 		return 0;
702 
703 	if (!auxtrace_record__snapshot_started &&
704 	    auxtrace_record__snapshot_start(rec->itr))
705 		return -1;
706 
707 	record__read_auxtrace_snapshot(rec, true);
708 	if (trigger_is_error(&auxtrace_snapshot_trigger))
709 		return -1;
710 
711 	return 0;
712 }
713 
714 static int record__auxtrace_init(struct record *rec)
715 {
716 	int err;
717 
718 	if (!rec->itr) {
719 		rec->itr = auxtrace_record__init(rec->evlist, &err);
720 		if (err)
721 			return err;
722 	}
723 
724 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
725 					      rec->opts.auxtrace_snapshot_opts);
726 	if (err)
727 		return err;
728 
729 	err = auxtrace_parse_sample_options(rec->itr, rec->evlist, &rec->opts,
730 					    rec->opts.auxtrace_sample_opts);
731 	if (err)
732 		return err;
733 
734 	return auxtrace_parse_filters(rec->evlist);
735 }
736 
737 #else
738 
739 static inline
740 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
741 			       struct mmap *map __maybe_unused)
742 {
743 	return 0;
744 }
745 
746 static inline
747 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
748 				    bool on_exit __maybe_unused)
749 {
750 }
751 
752 static inline
753 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
754 {
755 	return 0;
756 }
757 
758 static inline
759 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
760 {
761 	return 0;
762 }
763 
764 static int record__auxtrace_init(struct record *rec __maybe_unused)
765 {
766 	return 0;
767 }
768 
769 #endif
770 
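/*
 * Ensure a software dummy event with attr.text_poke (and attr.ksymbol) set
 * is present, so that kernel text modifications are captured system wide,
 * on all CPUs, with timestamps.
 */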
771 static int record__config_text_poke(struct evlist *evlist)
772 {
773 	struct evsel *evsel;
774 	int err;
775 
776 	/* Nothing to do if text poke is already configured */
777 	evlist__for_each_entry(evlist, evsel) {
778 		if (evsel->core.attr.text_poke)
779 			return 0;
780 	}
781 
782 	err = parse_events(evlist, "dummy:u", NULL);
783 	if (err)
784 		return err;
785 
786 	evsel = evlist__last(evlist);
787 
788 	evsel->core.attr.freq = 0;
789 	evsel->core.attr.sample_period = 1;
790 	evsel->core.attr.text_poke = 1;
791 	evsel->core.attr.ksymbol = 1;
792 
793 	evsel->core.system_wide = true;
794 	evsel->no_aux_samples = true;
795 	evsel->immediate = true;
796 
797 	/* Text poke must be collected on all CPUs */
798 	perf_cpu_map__put(evsel->core.own_cpus);
799 	evsel->core.own_cpus = perf_cpu_map__new(NULL);
800 	perf_cpu_map__put(evsel->core.cpus);
801 	evsel->core.cpus = perf_cpu_map__get(evsel->core.own_cpus);
802 
803 	evsel__set_sample_bit(evsel, TIME);
804 
805 	return 0;
806 }
807 
808 static bool record__kcore_readable(struct machine *machine)
809 {
810 	char kcore[PATH_MAX];
811 	int fd;
812 
813 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
814 
815 	fd = open(kcore, O_RDONLY);
816 	if (fd < 0)
817 		return false;
818 
819 	close(fd);
820 
821 	return true;
822 }
823 
824 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
825 {
826 	char from_dir[PATH_MAX];
827 	char kcore_dir[PATH_MAX];
828 	int ret;
829 
830 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
831 
832 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
833 	if (ret)
834 		return ret;
835 
836 	return kcore_copy(from_dir, kcore_dir);
837 }
838 
839 static int record__mmap_evlist(struct record *rec,
840 			       struct evlist *evlist)
841 {
842 	struct record_opts *opts = &rec->opts;
843 	bool auxtrace_overwrite = opts->auxtrace_snapshot_mode ||
844 				  opts->auxtrace_sample_mode;
845 	char msg[512];
846 
847 	if (opts->affinity != PERF_AFFINITY_SYS)
848 		cpu__setup_cpunode_map();
849 
850 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
851 				 opts->auxtrace_mmap_pages,
852 				 auxtrace_overwrite,
853 				 opts->nr_cblocks, opts->affinity,
854 				 opts->mmap_flush, opts->comp_level) < 0) {
855 		if (errno == EPERM) {
856 			pr_err("Permission error mapping pages.\n"
857 			       "Consider increasing "
858 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
859 			       "or try again with a smaller value of -m/--mmap_pages.\n"
860 			       "(current value: %u,%u)\n",
861 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
862 			return -errno;
863 		} else {
864 			pr_err("failed to mmap with %d (%s)\n", errno,
865 				str_error_r(errno, msg, sizeof(msg)));
866 			if (errno)
867 				return -errno;
868 			else
869 				return -EINVAL;
870 		}
871 	}
872 	return 0;
873 }
874 
875 static int record__mmap(struct record *rec)
876 {
877 	return record__mmap_evlist(rec, rec->evlist);
878 }
879 
880 static int record__open(struct record *rec)
881 {
882 	char msg[BUFSIZ];
883 	struct evsel *pos;
884 	struct evlist *evlist = rec->evlist;
885 	struct perf_session *session = rec->session;
886 	struct record_opts *opts = &rec->opts;
887 	int rc = 0;
888 
889 	/*
890 	 * For initial_delay or system wide, we need to add a dummy event so
891 	 * that we can track PERF_RECORD_MMAP while waiting for the initial
892 	 * delay to pass or while event synthesis is in progress.
893 	 */
894 	if (opts->initial_delay || target__has_cpu(&opts->target)) {
895 		pos = evlist__get_tracking_event(evlist);
896 		if (!evsel__is_dummy_event(pos)) {
897 			/* Set up dummy event. */
898 			if (evlist__add_dummy(evlist))
899 				return -ENOMEM;
900 			pos = evlist__last(evlist);
901 			evlist__set_tracking_event(evlist, pos);
902 		}
903 
904 		/*
905 		 * Enable the dummy event when the process is forked for
906 		 * initial_delay, immediately for system wide.
907 		 */
908 		if (opts->initial_delay && !pos->immediate)
909 			pos->core.attr.enable_on_exec = 1;
910 		else
911 			pos->immediate = 1;
912 	}
913 
914 	evlist__config(evlist, opts, &callchain_param);
915 
916 	evlist__for_each_entry(evlist, pos) {
917 try_again:
918 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
919 			if (evsel__fallback(pos, errno, msg, sizeof(msg))) {
920 				if (verbose > 0)
921 					ui__warning("%s\n", msg);
922 				goto try_again;
923 			}
924 			if ((errno == EINVAL || errno == EBADF) &&
925 			    pos->leader != pos &&
926 			    pos->weak_group) {
927 				pos = evlist__reset_weak_group(evlist, pos, true);
928 				goto try_again;
929 			}
930 			rc = -errno;
931 			evsel__open_strerror(pos, &opts->target, errno, msg, sizeof(msg));
932 			ui__error("%s\n", msg);
933 			goto out;
934 		}
935 
936 		pos->supported = true;
937 	}
938 
939 	if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) {
940 		pr_warning(
941 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
942 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
943 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
944 "file is not found in the buildid cache or in the vmlinux path.\n\n"
945 "Samples in kernel modules won't be resolved at all.\n\n"
946 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
947 "even with a suitable vmlinux or kallsyms file.\n\n");
948 	}
949 
950 	if (evlist__apply_filters(evlist, &pos)) {
951 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
952 			pos->filter, evsel__name(pos), errno,
953 			str_error_r(errno, msg, sizeof(msg)));
954 		rc = -1;
955 		goto out;
956 	}
957 
958 	rc = record__mmap(rec);
959 	if (rc)
960 		goto out;
961 
962 	session->evlist = evlist;
963 	perf_session__set_id_hdr_size(session);
964 out:
965 	return rc;
966 }
967 
968 static int process_sample_event(struct perf_tool *tool,
969 				union perf_event *event,
970 				struct perf_sample *sample,
971 				struct evsel *evsel,
972 				struct machine *machine)
973 {
974 	struct record *rec = container_of(tool, struct record, tool);
975 
976 	if (rec->evlist->first_sample_time == 0)
977 		rec->evlist->first_sample_time = sample->time;
978 
979 	rec->evlist->last_sample_time = sample->time;
980 
981 	if (rec->buildid_all)
982 		return 0;
983 
984 	rec->samples++;
985 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
986 }
987 
988 static int process_buildids(struct record *rec)
989 {
990 	struct perf_session *session = rec->session;
991 
992 	if (perf_data__size(&rec->data) == 0)
993 		return 0;
994 
995 	/*
996 	 * During this process, it'll load the kernel map and replace
997 	 * dso->long_name with the real pathname it found.  In this case
998 	 * we prefer the vmlinux path like
999 	 *   /lib/modules/3.16.4/build/vmlinux
1000 	 *
1001 	 * rather than build-id path (in debug directory).
1002 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
1003 	 */
1004 	symbol_conf.ignore_vmlinux_buildid = true;
1005 
1006 	/*
1007 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
1008 	 * so there is no need to process samples. But if timestamp_boundary is
1009 	 * enabled, it still needs to walk all samples to get the timestamps of
1010 	 * the first/last samples.
1011 	 */
1012 	if (rec->buildid_all && !rec->timestamp_boundary)
1013 		rec->tool.sample = NULL;
1014 
1015 	return perf_session__process_events(session);
1016 }
1017 
1018 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
1019 {
1020 	int err;
1021 	struct perf_tool *tool = data;
1022 	/*
1023 	 * As for the guest kernel, when processing the record & report
1024 	 * subcommands we arrange the module mmaps prior to the guest kernel
1025 	 * mmap and trigger a dso preload, because by default guest module
1026 	 * symbols are loaded from guest kallsyms instead of
1027 	 * /lib/modules/XXX/XXX. This method avoids missing symbols when the
1028 	 * first address is in a module instead of in the guest kernel.
1029 	 */
1030 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
1031 					     machine);
1032 	if (err < 0)
1033 		pr_err("Couldn't record guest kernel [%d]'s module"
1034 		       " information.\n", machine->pid);
1035 
1036 	/*
1037 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
1038 	 * have no _text sometimes.
1039 	 */
1040 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1041 						 machine);
1042 	if (err < 0)
1043 		pr_err("Couldn't record guest kernel [%d]'s reference"
1044 		       " relocation symbol.\n", machine->pid);
1045 }
1046 
1047 static struct perf_event_header finished_round_event = {
1048 	.size = sizeof(struct perf_event_header),
1049 	.type = PERF_RECORD_FINISHED_ROUND,
1050 };
1051 
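/*
 * If the affinity mode is not PERF_AFFINITY_SYS, switch this thread's CPU
 * affinity to the mask of the mmap buffer that is about to be flushed, so
 * the data is read on a CPU close to where it was written.
 */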
1052 static void record__adjust_affinity(struct record *rec, struct mmap *map)
1053 {
1054 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
1055 	    !bitmap_equal(rec->affinity_mask.bits, map->affinity_mask.bits,
1056 			  rec->affinity_mask.nbits)) {
1057 		bitmap_zero(rec->affinity_mask.bits, rec->affinity_mask.nbits);
1058 		bitmap_or(rec->affinity_mask.bits, rec->affinity_mask.bits,
1059 			  map->affinity_mask.bits, rec->affinity_mask.nbits);
1060 		sched_setaffinity(0, MMAP_CPU_MASK_BYTES(&rec->affinity_mask),
1061 				  (cpu_set_t *)rec->affinity_mask.bits);
1062 		if (verbose == 2)
1063 			mmap_cpu_mask__scnprintf(&rec->affinity_mask, "thread");
1064 	}
1065 }
1066 
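/*
 * Callback for zstd_compress_stream_to_records(): on the first call
 * (increment == 0) initialize a PERF_RECORD_COMPRESSED header and reserve
 * room for it; on subsequent calls grow header.size as compressed payload
 * is appended.
 */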
1067 static size_t process_comp_header(void *record, size_t increment)
1068 {
1069 	struct perf_record_compressed *event = record;
1070 	size_t size = sizeof(*event);
1071 
1072 	if (increment) {
1073 		event->header.size += increment;
1074 		return increment;
1075 	}
1076 
1077 	event->header.type = PERF_RECORD_COMPRESSED;
1078 	event->header.size = size;
1079 
1080 	return size;
1081 }
1082 
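/*
 * Compress a chunk of trace data into one or more PERF_RECORD_COMPRESSED
 * records and account the raw vs. compressed byte counts used to report
 * the compression ratio at the end of the session.
 */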
1083 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
1084 			    void *src, size_t src_size)
1085 {
1086 	size_t compressed;
1087 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
1088 
1089 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
1090 						     max_record_size, process_comp_header);
1091 
1092 	session->bytes_transferred += src_size;
1093 	session->bytes_compressed  += compressed;
1094 
1095 	return compressed;
1096 }
1097 
1098 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
1099 				    bool overwrite, bool synch)
1100 {
1101 	u64 bytes_written = rec->bytes_written;
1102 	int i;
1103 	int rc = 0;
1104 	struct mmap *maps;
1105 	int trace_fd = rec->data.file.fd;
1106 	off_t off = 0;
1107 
1108 	if (!evlist)
1109 		return 0;
1110 
1111 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1112 	if (!maps)
1113 		return 0;
1114 
1115 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1116 		return 0;
1117 
1118 	if (record__aio_enabled(rec))
1119 		off = record__aio_get_pos(trace_fd);
1120 
1121 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1122 		u64 flush = 0;
1123 		struct mmap *map = &maps[i];
1124 
1125 		if (map->core.base) {
1126 			record__adjust_affinity(rec, map);
1127 			if (synch) {
1128 				flush = map->core.flush;
1129 				map->core.flush = 1;
1130 			}
1131 			if (!record__aio_enabled(rec)) {
1132 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1133 					if (synch)
1134 						map->core.flush = flush;
1135 					rc = -1;
1136 					goto out;
1137 				}
1138 			} else {
1139 				if (record__aio_push(rec, map, &off) < 0) {
1140 					record__aio_set_pos(trace_fd, off);
1141 					if (synch)
1142 						map->core.flush = flush;
1143 					rc = -1;
1144 					goto out;
1145 				}
1146 			}
1147 			if (synch)
1148 				map->core.flush = flush;
1149 		}
1150 
1151 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1152 		    !rec->opts.auxtrace_sample_mode &&
1153 		    record__auxtrace_mmap_read(rec, map) != 0) {
1154 			rc = -1;
1155 			goto out;
1156 		}
1157 	}
1158 
1159 	if (record__aio_enabled(rec))
1160 		record__aio_set_pos(trace_fd, off);
1161 
1162 	/*
1163 	 * Mark the round finished in case we wrote
1164 	 * at least one event.
1165 	 */
1166 	if (bytes_written != rec->bytes_written)
1167 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1168 
1169 	if (overwrite)
1170 		evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1171 out:
1172 	return rc;
1173 }
1174 
1175 static int record__mmap_read_all(struct record *rec, bool synch)
1176 {
1177 	int err;
1178 
1179 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1180 	if (err)
1181 		return err;
1182 
1183 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1184 }
1185 
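/*
 * Start with all header features enabled, then clear the ones that do not
 * apply to this session (build ids, tracepoints, branch stack, auxtrace,
 * clock data, compression, ...).
 */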
1186 static void record__init_features(struct record *rec)
1187 {
1188 	struct perf_session *session = rec->session;
1189 	int feat;
1190 
1191 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1192 		perf_header__set_feat(&session->header, feat);
1193 
1194 	if (rec->no_buildid)
1195 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1196 
1197 	if (!have_tracepoints(&rec->evlist->core.entries))
1198 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1199 
1200 	if (!rec->opts.branch_stack)
1201 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1202 
1203 	if (!rec->opts.full_auxtrace)
1204 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1205 
1206 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1207 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1208 
1209 	if (!rec->opts.use_clockid)
1210 		perf_header__clear_feat(&session->header, HEADER_CLOCK_DATA);
1211 
1212 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1213 	if (!record__comp_enabled(rec))
1214 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1215 
1216 	perf_header__clear_feat(&session->header, HEADER_STAT);
1217 }
1218 
1219 static void
1220 record__finish_output(struct record *rec)
1221 {
1222 	struct perf_data *data = &rec->data;
1223 	int fd = perf_data__fd(data);
1224 
1225 	if (data->is_pipe)
1226 		return;
1227 
1228 	rec->session->header.data_size += rec->bytes_written;
1229 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1230 
1231 	if (!rec->no_buildid) {
1232 		process_buildids(rec);
1233 
1234 		if (rec->buildid_all)
1235 			dsos__hit_all(rec->session);
1236 	}
1237 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1238 
1239 	return;
1240 }
1241 
1242 static int record__synthesize_workload(struct record *rec, bool tail)
1243 {
1244 	int err;
1245 	struct perf_thread_map *thread_map;
1246 
1247 	if (rec->opts.tail_synthesize != tail)
1248 		return 0;
1249 
1250 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1251 	if (thread_map == NULL)
1252 		return -1;
1253 
1254 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1255 						 process_synthesized_event,
1256 						 &rec->session->machines.host,
1257 						 rec->opts.sample_address);
1258 	perf_thread_map__put(thread_map);
1259 	return err;
1260 }
1261 
1262 static int record__synthesize(struct record *rec, bool tail);
1263 
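/*
 * Finish the current output file and switch to a new, timestamp-suffixed
 * one. When not switching at exit, tracking events are re-synthesized into
 * the new file so it can be decoded on its own; with a maximum number of
 * output files configured (switch_output.num_files) the oldest file in the
 * rotation is removed.
 */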
1264 static int
1265 record__switch_output(struct record *rec, bool at_exit)
1266 {
1267 	struct perf_data *data = &rec->data;
1268 	int fd, err;
1269 	char *new_filename;
1270 
1271 	/* Same Size:      "2015122520103046" */
1272 	char timestamp[] = "InvalidTimestamp";
1273 
1274 	record__aio_mmap_read_sync(rec);
1275 
1276 	record__synthesize(rec, true);
1277 	if (target__none(&rec->opts.target))
1278 		record__synthesize_workload(rec, true);
1279 
1280 	rec->samples = 0;
1281 	record__finish_output(rec);
1282 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1283 	if (err) {
1284 		pr_err("Failed to get current timestamp\n");
1285 		return -EINVAL;
1286 	}
1287 
1288 	fd = perf_data__switch(data, timestamp,
1289 				    rec->session->header.data_offset,
1290 				    at_exit, &new_filename);
1291 	if (fd >= 0 && !at_exit) {
1292 		rec->bytes_written = 0;
1293 		rec->session->header.data_size = 0;
1294 	}
1295 
1296 	if (!quiet)
1297 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1298 			data->path, timestamp);
1299 
1300 	if (rec->switch_output.num_files) {
1301 		int n = rec->switch_output.cur_file + 1;
1302 
1303 		if (n >= rec->switch_output.num_files)
1304 			n = 0;
1305 		rec->switch_output.cur_file = n;
1306 		if (rec->switch_output.filenames[n]) {
1307 			remove(rec->switch_output.filenames[n]);
1308 			zfree(&rec->switch_output.filenames[n]);
1309 		}
1310 		rec->switch_output.filenames[n] = new_filename;
1311 	} else {
1312 		free(new_filename);
1313 	}
1314 
1315 	/* Output tracking events */
1316 	if (!at_exit) {
1317 		record__synthesize(rec, false);
1318 
1319 		/*
1320 		 * In 'perf record --switch-output' without -a,
1321 		 * record__synthesize() in record__switch_output() won't
1322 		 * generate tracking events because there's no thread_map
1323 		 * in the evlist. As a result, the newly created perf.data
1324 		 * doesn't contain map and comm information.
1325 		 * Create a fake thread_map and directly call
1326 		 * perf_event__synthesize_thread_map() for those events.
1327 		 */
1328 		if (target__none(&rec->opts.target))
1329 			record__synthesize_workload(rec, false);
1330 	}
1331 	return fd;
1332 }
1333 
1334 static volatile int workload_exec_errno;
1335 
1336 /*
1337  * evlist__prepare_workload will send a SIGUSR1
1338  * if the fork fails, since we asked for it by setting its
1339  * want_signal to true.
1340  */
1341 static void workload_exec_failed_signal(int signo __maybe_unused,
1342 					siginfo_t *info,
1343 					void *ucontext __maybe_unused)
1344 {
1345 	workload_exec_errno = info->si_value.sival_int;
1346 	done = 1;
1347 	child_finished = 1;
1348 }
1349 
1350 static void snapshot_sig_handler(int sig);
1351 static void alarm_sig_handler(int sig);
1352 
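/*
 * Find a mmaped perf_event_mmap_page that can serve as the user page for
 * synthesizing the time conversion (TIME_CONV) event, see
 * perf_event__synth_time_conv() below.
 */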
1353 static const struct perf_event_mmap_page *evlist__pick_pc(struct evlist *evlist)
1354 {
1355 	if (evlist) {
1356 		if (evlist->mmap && evlist->mmap[0].core.base)
1357 			return evlist->mmap[0].core.base;
1358 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1359 			return evlist->overwrite_mmap[0].core.base;
1360 	}
1361 	return NULL;
1362 }
1363 
1364 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1365 {
1366 	const struct perf_event_mmap_page *pc = evlist__pick_pc(rec->evlist);
1367 	if (pc)
1368 		return pc;
1369 	return NULL;
1370 }
1371 
1372 static int record__synthesize(struct record *rec, bool tail)
1373 {
1374 	struct perf_session *session = rec->session;
1375 	struct machine *machine = &session->machines.host;
1376 	struct perf_data *data = &rec->data;
1377 	struct record_opts *opts = &rec->opts;
1378 	struct perf_tool *tool = &rec->tool;
1379 	int fd = perf_data__fd(data);
1380 	int err = 0;
1381 	event_op f = process_synthesized_event;
1382 
1383 	if (rec->opts.tail_synthesize != tail)
1384 		return 0;
1385 
1386 	if (data->is_pipe) {
1387 		/*
1388 		 * We need to synthesize events first, because some
1389 		 * features work on top of them (on the report side).
1390 		 */
1391 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1392 						   process_synthesized_event);
1393 		if (err < 0) {
1394 			pr_err("Couldn't synthesize attrs.\n");
1395 			goto out;
1396 		}
1397 
1398 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1399 						      process_synthesized_event);
1400 		if (err < 0) {
1401 			pr_err("Couldn't synthesize features.\n");
1402 			return err;
1403 		}
1404 
1405 		if (have_tracepoints(&rec->evlist->core.entries)) {
1406 			/*
1407 			 * FIXME err <= 0 here actually means that
1408 			 * there were no tracepoints so it's not really
1409 			 * an error, just that we don't need to
1410 			 * synthesize anything.  We really have to
1411 			 * return this more properly and also
1412 			 * propagate errors that currently call die()
1413 			 */
1414 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1415 								  process_synthesized_event);
1416 			if (err <= 0) {
1417 				pr_err("Couldn't record tracing data.\n");
1418 				goto out;
1419 			}
1420 			rec->bytes_written += err;
1421 		}
1422 	}
1423 
1424 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1425 					  process_synthesized_event, machine);
1426 	if (err)
1427 		goto out;
1428 
1429 	/* Synthesize id_index before auxtrace_info */
1430 	if (rec->opts.auxtrace_sample_mode) {
1431 		err = perf_event__synthesize_id_index(tool,
1432 						      process_synthesized_event,
1433 						      session->evlist, machine);
1434 		if (err)
1435 			goto out;
1436 	}
1437 
1438 	if (rec->opts.full_auxtrace) {
1439 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1440 					session, process_synthesized_event);
1441 		if (err)
1442 			goto out;
1443 	}
1444 
1445 	if (!evlist__exclude_kernel(rec->evlist)) {
1446 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1447 							 machine);
1448 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1449 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1450 				   "Check /proc/kallsyms permission or run as root.\n");
1451 
1452 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1453 						     machine);
1454 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1455 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1456 				   "Check /proc/modules permission or run as root.\n");
1457 	}
1458 
1459 	if (perf_guest) {
1460 		machines__process_guests(&session->machines,
1461 					 perf_event__synthesize_guest_os, tool);
1462 	}
1463 
1464 	err = perf_event__synthesize_extra_attr(&rec->tool,
1465 						rec->evlist,
1466 						process_synthesized_event,
1467 						data->is_pipe);
1468 	if (err)
1469 		goto out;
1470 
1471 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1472 						 process_synthesized_event,
1473 						NULL);
1474 	if (err < 0) {
1475 		pr_err("Couldn't synthesize thread map.\n");
1476 		return err;
1477 	}
1478 
1479 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1480 					     process_synthesized_event, NULL);
1481 	if (err < 0) {
1482 		pr_err("Couldn't synthesize cpu map.\n");
1483 		return err;
1484 	}
1485 
1486 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1487 						machine, opts);
1488 	if (err < 0)
1489 		pr_warning("Couldn't synthesize bpf events.\n");
1490 
1491 	err = perf_event__synthesize_cgroups(tool, process_synthesized_event,
1492 					     machine);
1493 	if (err < 0)
1494 		pr_warning("Couldn't synthesize cgroup events.\n");
1495 
1496 	if (rec->opts.nr_threads_synthesize > 1) {
1497 		perf_set_multithreaded();
1498 		f = process_locked_synthesized_event;
1499 	}
1500 
1501 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1502 					    f, opts->sample_address,
1503 					    rec->opts.nr_threads_synthesize);
1504 
1505 	if (rec->opts.nr_threads_synthesize > 1)
1506 		perf_set_singlethreaded();
1507 
1508 out:
1509 	return err;
1510 }
1511 
1512 static int record__process_signal_event(union perf_event *event __maybe_unused, void *data)
1513 {
1514 	struct record *rec = data;
1515 	pthread_kill(rec->thread_id, SIGUSR2);
1516 	return 0;
1517 }
1518 
1519 static int record__setup_sb_evlist(struct record *rec)
1520 {
1521 	struct record_opts *opts = &rec->opts;
1522 
1523 	if (rec->sb_evlist != NULL) {
1524 		/*
1525 		 * We get here if --switch-output-event populated the
1526 		 * sb_evlist, so associate a callback that will send a SIGUSR2
1527 		 * to the main thread.
1528 		 */
1529 		evlist__set_cb(rec->sb_evlist, record__process_signal_event, rec);
1530 		rec->thread_id = pthread_self();
1531 	}
1532 #ifdef HAVE_LIBBPF_SUPPORT
1533 	if (!opts->no_bpf_event) {
1534 		if (rec->sb_evlist == NULL) {
1535 			rec->sb_evlist = evlist__new();
1536 
1537 			if (rec->sb_evlist == NULL) {
1538 				pr_err("Couldn't create side band evlist.\n");
1539 				return -1;
1540 			}
1541 		}
1542 
1543 		if (evlist__add_bpf_sb_event(rec->sb_evlist, &rec->session->header.env)) {
1544 			pr_err("Couldn't ask for PERF_RECORD_BPF_EVENT side band events.\n");
1545 			return -1;
1546 		}
1547 	}
1548 #endif
1549 	if (evlist__start_sb_thread(rec->sb_evlist, &rec->opts.target)) {
1550 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1551 		opts->no_bpf_event = true;
1552 	}
1553 
1554 	return 0;
1555 }
1556 
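/*
 * When --clockid is used, store a pair of reference timestamps (wall clock
 * from gettimeofday() and the selected clockid) in the header, so that
 * perf timestamps can later be related to the time of day.
 */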
1557 static int record__init_clock(struct record *rec)
1558 {
1559 	struct perf_session *session = rec->session;
1560 	struct timespec ref_clockid;
1561 	struct timeval ref_tod;
1562 	u64 ref;
1563 
1564 	if (!rec->opts.use_clockid)
1565 		return 0;
1566 
1567 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1568 		session->header.env.clock.clockid_res_ns = rec->opts.clockid_res_ns;
1569 
1570 	session->header.env.clock.clockid = rec->opts.clockid;
1571 
1572 	if (gettimeofday(&ref_tod, NULL) != 0) {
1573 		pr_err("gettimeofday failed, cannot set reference time.\n");
1574 		return -1;
1575 	}
1576 
1577 	if (clock_gettime(rec->opts.clockid, &ref_clockid)) {
1578 		pr_err("clock_gettime failed, cannot set reference time.\n");
1579 		return -1;
1580 	}
1581 
1582 	ref = (u64) ref_tod.tv_sec * NSEC_PER_SEC +
1583 	      (u64) ref_tod.tv_usec * NSEC_PER_USEC;
1584 
1585 	session->header.env.clock.tod_ns = ref;
1586 
1587 	ref = (u64) ref_clockid.tv_sec * NSEC_PER_SEC +
1588 	      (u64) ref_clockid.tv_nsec;
1589 
1590 	session->header.env.clock.clockid_ns = ref;
1591 	return 0;
1592 }
1593 
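/*
 * Arm an AUX area snapshot: mark the trigger as hit and ask the auxtrace
 * backend to start the snapshot; the data is read back in the main loop
 * once auxtrace_record__snapshot_started is seen.
 */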
1594 static void hit_auxtrace_snapshot_trigger(struct record *rec)
1595 {
1596 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1597 		trigger_hit(&auxtrace_snapshot_trigger);
1598 		auxtrace_record__snapshot_started = 1;
1599 		if (auxtrace_record__snapshot_start(rec->itr))
1600 			trigger_error(&auxtrace_snapshot_trigger);
1601 	}
1602 }
1603 
1604 static int __cmd_record(struct record *rec, int argc, const char **argv)
1605 {
1606 	int err;
1607 	int status = 0;
1608 	unsigned long waking = 0;
1609 	const bool forks = argc > 0;
1610 	struct perf_tool *tool = &rec->tool;
1611 	struct record_opts *opts = &rec->opts;
1612 	struct perf_data *data = &rec->data;
1613 	struct perf_session *session;
1614 	bool disabled = false, draining = false;
1615 	int fd;
1616 	float ratio = 0;
1617 	enum evlist_ctl_cmd cmd = EVLIST_CTL_CMD_UNSUPPORTED;
1618 
1619 	atexit(record__sig_exit);
1620 	signal(SIGCHLD, sig_handler);
1621 	signal(SIGINT, sig_handler);
1622 	signal(SIGTERM, sig_handler);
1623 	signal(SIGSEGV, sigsegv_handler);
1624 
1625 	if (rec->opts.record_namespaces)
1626 		tool->namespace_events = true;
1627 
1628 	if (rec->opts.record_cgroup) {
1629 #ifdef HAVE_FILE_HANDLE
1630 		tool->cgroup_events = true;
1631 #else
1632 		pr_err("cgroup tracking is not supported\n");
1633 		return -1;
1634 #endif
1635 	}
1636 
1637 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1638 		signal(SIGUSR2, snapshot_sig_handler);
1639 		if (rec->opts.auxtrace_snapshot_mode)
1640 			trigger_on(&auxtrace_snapshot_trigger);
1641 		if (rec->switch_output.enabled)
1642 			trigger_on(&switch_output_trigger);
1643 	} else {
1644 		signal(SIGUSR2, SIG_IGN);
1645 	}
1646 
1647 	session = perf_session__new(data, false, tool);
1648 	if (IS_ERR(session)) {
1649 		pr_err("Perf session creation failed.\n");
1650 		return PTR_ERR(session);
1651 	}
1652 
1653 	fd = perf_data__fd(data);
1654 	rec->session = session;
1655 
1656 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1657 		pr_err("Compression initialization failed.\n");
1658 		return -1;
1659 	}
1660 #ifdef HAVE_EVENTFD_SUPPORT
1661 	done_fd = eventfd(0, EFD_NONBLOCK);
1662 	if (done_fd < 0) {
1663 		pr_err("Failed to create wakeup eventfd, error: %m\n");
1664 		status = -1;
1665 		goto out_delete_session;
1666 	}
1667 	err = evlist__add_pollfd(rec->evlist, done_fd);
1668 	if (err < 0) {
1669 		pr_err("Failed to add wakeup eventfd to poll list\n");
1670 		status = err;
1671 		goto out_delete_session;
1672 	}
1673 #endif // HAVE_EVENTFD_SUPPORT
1674 
1675 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1676 	session->header.env.comp_level = rec->opts.comp_level;
1677 
1678 	if (rec->opts.kcore &&
1679 	    !record__kcore_readable(&session->machines.host)) {
1680 		pr_err("ERROR: kcore is not readable.\n");
1681 		return -1;
1682 	}
1683 
1684 	if (record__init_clock(rec))
1685 		return -1;
1686 
1687 	record__init_features(rec);
1688 
1689 	if (forks) {
1690 		err = evlist__prepare_workload(rec->evlist, &opts->target, argv, data->is_pipe,
1691 					       workload_exec_failed_signal);
1692 		if (err < 0) {
1693 			pr_err("Couldn't run the workload!\n");
1694 			status = err;
1695 			goto out_delete_session;
1696 		}
1697 	}
1698 
1699 	/*
1700 	 * If we have just a single event and are sending data
1701 	 * through a pipe, we need to force id allocation,
1702 	 * because we synthesize the event name through the pipe
1703 	 * and need the id for that.
1704 	 */
1705 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1706 		rec->opts.sample_id = true;
1707 
1708 	if (record__open(rec) != 0) {
1709 		err = -1;
1710 		goto out_child;
1711 	}
1712 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1713 
1714 	if (rec->opts.kcore) {
1715 		err = record__kcore_copy(&session->machines.host, data);
1716 		if (err) {
1717 			pr_err("ERROR: Failed to copy kcore\n");
1718 			goto out_child;
1719 		}
1720 	}
1721 
1722 	err = bpf__apply_obj_config();
1723 	if (err) {
1724 		char errbuf[BUFSIZ];
1725 
1726 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1727 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1728 			 errbuf);
1729 		goto out_child;
1730 	}
1731 
1732 	/*
1733 	 * Normally perf_session__new would do this, but it doesn't have the
1734 	 * evlist.
1735 	 */
1736 	if (rec->tool.ordered_events && !evlist__sample_id_all(rec->evlist)) {
1737 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1738 		rec->tool.ordered_events = false;
1739 	}
1740 
1741 	if (!rec->evlist->nr_groups)
1742 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1743 
1744 	if (data->is_pipe) {
1745 		err = perf_header__write_pipe(fd);
1746 		if (err < 0)
1747 			goto out_child;
1748 	} else {
1749 		err = perf_session__write_header(session, rec->evlist, fd, false);
1750 		if (err < 0)
1751 			goto out_child;
1752 	}
1753 
1754 	err = -1;
1755 	if (!rec->no_buildid
1756 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1757 		pr_err("Couldn't generate buildids. "
1758 		       "Use --no-buildid to profile anyway.\n");
1759 		goto out_child;
1760 	}
1761 
1762 	err = record__setup_sb_evlist(rec);
1763 	if (err)
1764 		goto out_child;
1765 
1766 	err = record__synthesize(rec, false);
1767 	if (err < 0)
1768 		goto out_child;
1769 
1770 	if (rec->realtime_prio) {
1771 		struct sched_param param;
1772 
1773 		param.sched_priority = rec->realtime_prio;
1774 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1775 			pr_err("Could not set realtime priority.\n");
1776 			err = -1;
1777 			goto out_child;
1778 		}
1779 	}
1780 
1781 	/*
1782 	 * When perf is starting the traced process, all the events
1783 	 * (apart from group members) have enable_on_exec=1 set,
1784 	 * so don't spoil it by prematurely enabling them.
1785 	 */
1786 	if (!target__none(&opts->target) && !opts->initial_delay)
1787 		evlist__enable(rec->evlist);
1788 
1789 	/*
1790 	 * Let the child rip
1791 	 */
1792 	if (forks) {
1793 		struct machine *machine = &session->machines.host;
1794 		union perf_event *event;
1795 		pid_t tgid;
1796 
1797 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1798 		if (event == NULL) {
1799 			err = -ENOMEM;
1800 			goto out_child;
1801 		}
1802 
1803 		/*
1804 		 * Some H/W events are generated before the COMM event,
1805 		 * which is emitted during exec(), so perf script
1806 		 * cannot see a correct process name for those events.
1807 		 * Synthesize the COMM event to prevent that.
1808 		 */
1809 		tgid = perf_event__synthesize_comm(tool, event,
1810 						   rec->evlist->workload.pid,
1811 						   process_synthesized_event,
1812 						   machine);
1813 		free(event);
1814 
1815 		if (tgid == -1)
1816 			goto out_child;
1817 
1818 		event = malloc(sizeof(event->namespaces) +
1819 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1820 			       machine->id_hdr_size);
1821 		if (event == NULL) {
1822 			err = -ENOMEM;
1823 			goto out_child;
1824 		}
1825 
1826 		/*
1827 		 * Synthesize NAMESPACES event for the command specified.
1828 		 */
1829 		perf_event__synthesize_namespaces(tool, event,
1830 						  rec->evlist->workload.pid,
1831 						  tgid, process_synthesized_event,
1832 						  machine);
1833 		free(event);
1834 
1835 		evlist__start_workload(rec->evlist);
1836 	}
1837 
1838 	if (evlist__initialize_ctlfd(rec->evlist, opts->ctl_fd, opts->ctl_fd_ack))
1839 		goto out_child;
1840 
1841 	if (opts->initial_delay) {
1842 		pr_info(EVLIST_DISABLED_MSG);
1843 		if (opts->initial_delay > 0) {
1844 			usleep(opts->initial_delay * USEC_PER_MSEC);
1845 			evlist__enable(rec->evlist);
1846 			pr_info(EVLIST_ENABLED_MSG);
1847 		}
1848 	}
1849 
1850 	trigger_ready(&auxtrace_snapshot_trigger);
1851 	trigger_ready(&switch_output_trigger);
1852 	perf_hooks__invoke_record_start();
1853 	for (;;) {
1854 		unsigned long long hits = rec->samples;
1855 
1856 		/*
1857 		 * rec->evlist->bkw_mmap_state may be
1858 		 * BKW_MMAP_EMPTY here: when done == true and
1859 		 * hits != rec->samples in the previous round.
1860 		 *
1861 		 * evlist__toggle_bkw_mmap() ensures we never
1862 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1863 		 */
1864 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1865 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1866 
1867 		if (record__mmap_read_all(rec, false) < 0) {
1868 			trigger_error(&auxtrace_snapshot_trigger);
1869 			trigger_error(&switch_output_trigger);
1870 			err = -1;
1871 			goto out_child;
1872 		}
1873 
1874 		if (auxtrace_record__snapshot_started) {
1875 			auxtrace_record__snapshot_started = 0;
1876 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1877 				record__read_auxtrace_snapshot(rec, false);
1878 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1879 				pr_err("AUX area tracing snapshot failed\n");
1880 				err = -1;
1881 				goto out_child;
1882 			}
1883 		}
1884 
1885 		if (trigger_is_hit(&switch_output_trigger)) {
1886 			/*
1887 			 * If switch_output_trigger is hit, the data in the
1888 			 * overwritable ring buffer should have been collected,
1889 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1890 			 *
1891 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1892 			 * record__mmap_read_all() didn't collect data from the
1893 			 * overwritable ring buffer. Read again.
1894 			 */
1895 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1896 				continue;
1897 			trigger_ready(&switch_output_trigger);
1898 
1899 			/*
1900 			 * Re-enable events in the overwrite ring buffer after
1901 			 * record__mmap_read_all(): we should have collected
1902 			 * all of its data by now.
1903 			 */
1904 			evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1905 
1906 			if (!quiet)
1907 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1908 					waking);
1909 			waking = 0;
1910 			fd = record__switch_output(rec, false);
1911 			if (fd < 0) {
1912 				pr_err("Failed to switch to new file\n");
1913 				trigger_error(&switch_output_trigger);
1914 				err = fd;
1915 				goto out_child;
1916 			}
1917 
1918 			/* re-arm the alarm */
1919 			if (rec->switch_output.time)
1920 				alarm(rec->switch_output.time);
1921 		}
1922 
1923 		if (hits == rec->samples) {
1924 			if (done || draining)
1925 				break;
1926 			err = evlist__poll(rec->evlist, -1);
1927 			/*
1928 			 * Propagate an error only if there is one. Ignore a positive
1929 			 * number of returned events and an EINTR interruption.
1930 			 */
1931 			if (err > 0 || (err < 0 && errno == EINTR))
1932 				err = 0;
1933 			waking++;
1934 
1935 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1936 				draining = true;
1937 		}
1938 
1939 		if (evlist__ctlfd_process(rec->evlist, &cmd) > 0) {
1940 			switch (cmd) {
1941 			case EVLIST_CTL_CMD_SNAPSHOT:
1942 				hit_auxtrace_snapshot_trigger(rec);
1943 				evlist__ctlfd_ack(rec->evlist);
1944 				break;
1945 			case EVLIST_CTL_CMD_STOP:
1946 				done = 1;
1947 				break;
1948 			case EVLIST_CTL_CMD_ACK:
1949 			case EVLIST_CTL_CMD_UNSUPPORTED:
1950 			case EVLIST_CTL_CMD_ENABLE:
1951 			case EVLIST_CTL_CMD_DISABLE:
1952 			case EVLIST_CTL_CMD_EVLIST:
1953 			case EVLIST_CTL_CMD_PING:
1954 			default:
1955 				break;
1956 			}
1957 		}
1958 
1959 		/*
1960 		 * When perf is starting the traced process, the events die with
1961 		 * the process at the end and we wait for that, so there is no
1962 		 * need to disable the events in this case.
1963 		 */
1964 		if (done && !disabled && !target__none(&opts->target)) {
1965 			trigger_off(&auxtrace_snapshot_trigger);
1966 			evlist__disable(rec->evlist);
1967 			disabled = true;
1968 		}
1969 	}
1970 
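	/*
	 * Out of the main loop: disarm both triggers and, if requested, take
	 * a final AUX area snapshot before flushing and closing the output.
	 */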
1971 	trigger_off(&auxtrace_snapshot_trigger);
1972 	trigger_off(&switch_output_trigger);
1973 
1974 	if (opts->auxtrace_snapshot_on_exit)
1975 		record__auxtrace_snapshot_exit(rec);
1976 
1977 	if (forks && workload_exec_errno) {
1978 		char msg[STRERR_BUFSIZE];
1979 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1980 		pr_err("Workload failed: %s\n", emsg);
1981 		err = -1;
1982 		goto out_child;
1983 	}
1984 
1985 	if (!quiet)
1986 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1987 
1988 	if (target__none(&rec->opts.target))
1989 		record__synthesize_workload(rec, true);
1990 
1991 out_child:
1992 	evlist__finalize_ctlfd(rec->evlist);
1993 	record__mmap_read_all(rec, true);
1994 	record__aio_mmap_read_sync(rec);
1995 
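	/*
	 * Compute the overall compression ratio; the + 0.5 rounds it to the
	 * nearest whole number when it is stored in the perf.data header env.
	 */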
1996 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1997 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1998 		session->header.env.comp_ratio = ratio + 0.5;
1999 	}
2000 
2001 	if (forks) {
2002 		int exit_status;
2003 
2004 		if (!child_finished)
2005 			kill(rec->evlist->workload.pid, SIGTERM);
2006 
2007 		wait(&exit_status);
2008 
2009 		if (err < 0)
2010 			status = err;
2011 		else if (WIFEXITED(exit_status))
2012 			status = WEXITSTATUS(exit_status);
2013 		else if (WIFSIGNALED(exit_status))
2014 			signr = WTERMSIG(exit_status);
2015 	} else
2016 		status = err;
2017 
2018 	record__synthesize(rec, true);
2019 	/* this will be recalculated during process_buildids() */
2020 	rec->samples = 0;
2021 
2022 	if (!err) {
2023 		if (!rec->timestamp_filename) {
2024 			record__finish_output(rec);
2025 		} else {
2026 			fd = record__switch_output(rec, true);
2027 			if (fd < 0) {
2028 				status = fd;
2029 				goto out_delete_session;
2030 			}
2031 		}
2032 	}
2033 
2034 	perf_hooks__invoke_record_end();
2035 
2036 	if (!err && !quiet) {
2037 		char samples[128];
2038 		const char *postfix = rec->timestamp_filename ?
2039 					".<timestamp>" : "";
2040 
2041 		if (rec->samples && !rec->opts.full_auxtrace)
2042 			scnprintf(samples, sizeof(samples),
2043 				  " (%" PRIu64 " samples)", rec->samples);
2044 		else
2045 			samples[0] = '\0';
2046 
2047 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
2048 			perf_data__size(data) / 1024.0 / 1024.0,
2049 			data->path, postfix, samples);
2050 		if (ratio) {
2051 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
2052 					rec->session->bytes_transferred / 1024.0 / 1024.0,
2053 					ratio);
2054 		}
2055 		fprintf(stderr, " ]\n");
2056 	}
2057 
2058 out_delete_session:
2059 #ifdef HAVE_EVENTFD_SUPPORT
2060 	if (done_fd >= 0)
2061 		close(done_fd);
2062 #endif
2063 	zstd_fini(&session->zstd_data);
2064 	perf_session__delete(session);
2065 
2066 	if (!opts->no_bpf_event)
2067 		evlist__stop_sb_thread(rec->sb_evlist);
2068 	return status;
2069 }
2070 
2071 static void callchain_debug(struct callchain_param *callchain)
2072 {
2073 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
2074 
2075 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
2076 
2077 	if (callchain->record_mode == CALLCHAIN_DWARF)
2078 		pr_debug("callchain: stack dump size %d\n",
2079 			 callchain->dump_size);
2080 }
2081 
2082 int record_opts__parse_callchain(struct record_opts *record,
2083 				 struct callchain_param *callchain,
2084 				 const char *arg, bool unset)
2085 {
2086 	int ret;
2087 	callchain->enabled = !unset;
2088 
2089 	/* --no-call-graph */
2090 	if (unset) {
2091 		callchain->record_mode = CALLCHAIN_NONE;
2092 		pr_debug("callchain: disabled\n");
2093 		return 0;
2094 	}
2095 
2096 	ret = parse_callchain_record_opt(arg, callchain);
2097 	if (!ret) {
2098 		/* Enable data address sampling for DWARF unwind. */
2099 		if (callchain->record_mode == CALLCHAIN_DWARF)
2100 			record->sample_address = true;
2101 		callchain_debug(callchain);
2102 	}
2103 
2104 	return ret;
2105 }
2106 
2107 int record_parse_callchain_opt(const struct option *opt,
2108 			       const char *arg,
2109 			       int unset)
2110 {
2111 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
2112 }
2113 
2114 int record_callchain_opt(const struct option *opt,
2115 			 const char *arg __maybe_unused,
2116 			 int unset __maybe_unused)
2117 {
2118 	struct callchain_param *callchain = opt->value;
2119 
2120 	callchain->enabled = true;
2121 
2122 	if (callchain->record_mode == CALLCHAIN_NONE)
2123 		callchain->record_mode = CALLCHAIN_FP;
2124 
2125 	callchain_debug(callchain);
2126 	return 0;
2127 }
2128 
2129 static int perf_record_config(const char *var, const char *value, void *cb)
2130 {
2131 	struct record *rec = cb;
2132 
2133 	if (!strcmp(var, "record.build-id")) {
2134 		if (!strcmp(value, "cache"))
2135 			rec->no_buildid_cache = false;
2136 		else if (!strcmp(value, "no-cache"))
2137 			rec->no_buildid_cache = true;
2138 		else if (!strcmp(value, "skip"))
2139 			rec->no_buildid = true;
2140 		else if (!strcmp(value, "mmap"))
2141 			rec->buildid_mmap = true;
2142 		else
2143 			return -1;
2144 		return 0;
2145 	}
2146 	if (!strcmp(var, "record.call-graph")) {
2147 		var = "call-graph.record-mode";
2148 		return perf_default_config(var, value, cb);
2149 	}
2150 #ifdef HAVE_AIO_SUPPORT
2151 	if (!strcmp(var, "record.aio")) {
2152 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
2153 		if (!rec->opts.nr_cblocks)
2154 			rec->opts.nr_cblocks = nr_cblocks_default;
2155 	}
2156 #endif
2157 
2158 	return 0;
2159 }
2160 
2161 
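/*
 * --affinity=node|cpu: pick how the trace reading thread's CPU affinity is
 * set while draining mmap buffers; anything else keeps the default
 * PERF_AFFINITY_SYS set up in cmd_record().
 */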
2162 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
2163 {
2164 	struct record_opts *opts = (struct record_opts *)opt->value;
2165 
2166 	if (unset || !str)
2167 		return 0;
2168 
2169 	if (!strcasecmp(str, "node"))
2170 		opts->affinity = PERF_AFFINITY_NODE;
2171 	else if (!strcasecmp(str, "cpu"))
2172 		opts->affinity = PERF_AFFINITY_CPU;
2173 
2174 	return 0;
2175 }
2176 
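/*
 * --max-size: parse an output size limit with an optional B/K/M/G suffix,
 * e.g. "--max-size=200M" (example value); 0 means unlimited.
 */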
2177 static int parse_output_max_size(const struct option *opt,
2178 				 const char *str, int unset)
2179 {
2180 	unsigned long *s = (unsigned long *)opt->value;
2181 	static struct parse_tag tags_size[] = {
2182 		{ .tag  = 'B', .mult = 1       },
2183 		{ .tag  = 'K', .mult = 1 << 10 },
2184 		{ .tag  = 'M', .mult = 1 << 20 },
2185 		{ .tag  = 'G', .mult = 1 << 30 },
2186 		{ .tag  = 0 },
2187 	};
2188 	unsigned long val;
2189 
2190 	if (unset) {
2191 		*s = 0;
2192 		return 0;
2193 	}
2194 
2195 	val = parse_tag_value(str, tags_size);
2196 	if (val != (unsigned long) -1) {
2197 		*s = val;
2198 		return 0;
2199 	}
2200 
2201 	return -1;
2202 }
2203 
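/*
 * -m/--mmap-pages: a single value sets the data mmap size; an optional
 * second value after a comma sets the AUX area tracing mmap size,
 * e.g. "-m 512,128" (example values only).
 */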
2204 static int record__parse_mmap_pages(const struct option *opt,
2205 				    const char *str,
2206 				    int unset __maybe_unused)
2207 {
2208 	struct record_opts *opts = opt->value;
2209 	char *s, *p;
2210 	unsigned int mmap_pages;
2211 	int ret;
2212 
2213 	if (!str)
2214 		return -EINVAL;
2215 
2216 	s = strdup(str);
2217 	if (!s)
2218 		return -ENOMEM;
2219 
2220 	p = strchr(s, ',');
2221 	if (p)
2222 		*p = '\0';
2223 
2224 	if (*s) {
2225 		ret = __evlist__parse_mmap_pages(&mmap_pages, s);
2226 		if (ret)
2227 			goto out_free;
2228 		opts->mmap_pages = mmap_pages;
2229 	}
2230 
2231 	if (!p) {
2232 		ret = 0;
2233 		goto out_free;
2234 	}
2235 
2236 	ret = __evlist__parse_mmap_pages(&mmap_pages, p + 1);
2237 	if (ret)
2238 		goto out_free;
2239 
2240 	opts->auxtrace_mmap_pages = mmap_pages;
2241 
2242 out_free:
2243 	free(s);
2244 	return ret;
2245 }
2246 
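/*
 * --control: accept either "fd:ctl-fd[,ack-fd]" or "fifo:ctl-fifo[,ack-fifo]",
 * as described in the option help in __record_options[] below.
 */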
2247 static int parse_control_option(const struct option *opt,
2248 				const char *str,
2249 				int unset __maybe_unused)
2250 {
2251 	struct record_opts *opts = opt->value;
2252 
2253 	return evlist__parse_control(str, &opts->ctl_fd, &opts->ctl_fd_ack, &opts->ctl_fd_close);
2254 }
2255 
2256 static void switch_output_size_warn(struct record *rec)
2257 {
2258 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2259 	struct switch_output *s = &rec->switch_output;
2260 
2261 	wakeup_size /= 2;
2262 
2263 	if (s->size < wakeup_size) {
2264 		char buf[100];
2265 
2266 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2267 		pr_warning("WARNING: switch-output data size is lower than the "
2268 			   "wakeup kernel buffer size (%s), "
2269 			   "expect bigger perf.data sizes\n", buf);
2270 	}
2271 }
2272 
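/*
 * Interpret the --switch-output argument: "signal", a size threshold such
 * as "100M", or a time threshold such as "30s" (example values); using
 * --switch-output-events implies the signal variant.
 */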
2273 static int switch_output_setup(struct record *rec)
2274 {
2275 	struct switch_output *s = &rec->switch_output;
2276 	static struct parse_tag tags_size[] = {
2277 		{ .tag  = 'B', .mult = 1       },
2278 		{ .tag  = 'K', .mult = 1 << 10 },
2279 		{ .tag  = 'M', .mult = 1 << 20 },
2280 		{ .tag  = 'G', .mult = 1 << 30 },
2281 		{ .tag  = 0 },
2282 	};
2283 	static struct parse_tag tags_time[] = {
2284 		{ .tag  = 's', .mult = 1        },
2285 		{ .tag  = 'm', .mult = 60       },
2286 		{ .tag  = 'h', .mult = 60*60    },
2287 		{ .tag  = 'd', .mult = 60*60*24 },
2288 		{ .tag  = 0 },
2289 	};
2290 	unsigned long val;
2291 
2292 	/*
2293 	 * If we're using --switch-output-events, then imply
2294 	 * --switch-output=signal, as we'll send a SIGUSR2 from the side band
2295 	 * thread to its parent.
2296 	 */
2297 	if (rec->switch_output_event_set)
2298 		goto do_signal;
2299 
2300 	if (!s->set)
2301 		return 0;
2302 
2303 	if (!strcmp(s->str, "signal")) {
2304 do_signal:
2305 		s->signal = true;
2306 		pr_debug("switch-output with SIGUSR2 signal\n");
2307 		goto enabled;
2308 	}
2309 
2310 	val = parse_tag_value(s->str, tags_size);
2311 	if (val != (unsigned long) -1) {
2312 		s->size = val;
2313 		pr_debug("switch-output with %s size threshold\n", s->str);
2314 		goto enabled;
2315 	}
2316 
2317 	val = parse_tag_value(s->str, tags_time);
2318 	if (val != (unsigned long) -1) {
2319 		s->time = val;
2320 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2321 			 s->str, s->time);
2322 		goto enabled;
2323 	}
2324 
2325 	return -1;
2326 
2327 enabled:
2328 	rec->timestamp_filename = true;
2329 	s->enabled              = true;
2330 
2331 	if (s->size && !rec->opts.no_buffering)
2332 		switch_output_size_warn(rec);
2333 
2334 	return 0;
2335 }
2336 
2337 static const char * const __record_usage[] = {
2338 	"perf record [<options>] [<command>]",
2339 	"perf record [<options>] -- <command> [<options>]",
2340 	NULL
2341 };
2342 const char * const *record_usage = __record_usage;
2343 
2344 static int build_id__process_mmap(struct perf_tool *tool, union perf_event *event,
2345 				  struct perf_sample *sample, struct machine *machine)
2346 {
2347 	/*
2348 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2349 	 * so there is no need to add them twice.
2350 	 */
2351 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2352 		return 0;
2353 	return perf_event__process_mmap(tool, event, sample, machine);
2354 }
2355 
2356 static int build_id__process_mmap2(struct perf_tool *tool, union perf_event *event,
2357 				   struct perf_sample *sample, struct machine *machine)
2358 {
2359 	/*
2360 	 * We already have the kernel maps, put in place via perf_session__create_kernel_maps(),
2361 	 * so there is no need to add them twice.
2362 	 */
2363 	if (!(event->header.misc & PERF_RECORD_MISC_USER))
2364 		return 0;
2365 
2366 	return perf_event__process_mmap2(tool, event, sample, machine);
2367 }
2368 
2369 /*
2370  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2371  * because we need access to it in record__exit(), which is called after
2372  * cmd_record() exits, but since record_options needs to be accessible to
2373  * builtin-script, leave it here.
2374  *
2375  * At least we don't touch it in all the other functions here directly.
2376  *
2377  * Just say no to tons of global variables, sigh.
2378  */
2379 static struct record record = {
2380 	.opts = {
2381 		.sample_time	     = true,
2382 		.mmap_pages	     = UINT_MAX,
2383 		.user_freq	     = UINT_MAX,
2384 		.user_interval	     = ULLONG_MAX,
2385 		.freq		     = 4000,
2386 		.target		     = {
2387 			.uses_mmap   = true,
2388 			.default_per_cpu = true,
2389 		},
2390 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2391 		.nr_threads_synthesize = 1,
2392 		.ctl_fd              = -1,
2393 		.ctl_fd_ack          = -1,
2394 	},
2395 	.tool = {
2396 		.sample		= process_sample_event,
2397 		.fork		= perf_event__process_fork,
2398 		.exit		= perf_event__process_exit,
2399 		.comm		= perf_event__process_comm,
2400 		.namespaces	= perf_event__process_namespaces,
2401 		.mmap		= build_id__process_mmap,
2402 		.mmap2		= build_id__process_mmap2,
2403 		.ordered_events	= true,
2404 	},
2405 };
2406 
2407 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2408 	"\n\t\t\t\tDefault: fp";
2409 
2410 static bool dry_run;
2411 
2412 /*
2413  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2414  * with it and switch to using the library functions in perf_evlist that came
2415  * from builtin-record.c, i.e. use record_opts,
2416  * evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
2417  * using pipes, etc.
2418  */
2419 static struct option __record_options[] = {
2420 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2421 		     "event selector. use 'perf list' to list available events",
2422 		     parse_events_option),
2423 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2424 		     "event filter", parse_filter),
2425 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2426 			   NULL, "don't record events from perf itself",
2427 			   exclude_perf),
2428 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2429 		    "record events on existing process id"),
2430 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2431 		    "record events on existing thread id"),
2432 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2433 		    "collect data with this RT SCHED_FIFO priority"),
2434 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2435 		    "collect data without buffering"),
2436 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2437 		    "collect raw sample records from all opened counters"),
2438 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2439 			    "system-wide collection from all CPUs"),
2440 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2441 		    "list of cpus to monitor"),
2442 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2443 	OPT_STRING('o', "output", &record.data.path, "file",
2444 		    "output file name"),
2445 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2446 			&record.opts.no_inherit_set,
2447 			"child tasks do not inherit counters"),
2448 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2449 		    "synthesize non-sample events at the end of output"),
2450 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2451 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2452 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2453 		    "Fail if the specified frequency can't be used"),
2454 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2455 		     "profile at this frequency",
2456 		      record__parse_freq),
2457 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2458 		     "number of mmap data pages and AUX area tracing mmap pages",
2459 		     record__parse_mmap_pages),
2460 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2461 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2462 		     record__mmap_flush_parse),
2463 	OPT_BOOLEAN(0, "group", &record.opts.group,
2464 		    "put the counters into a counter group"),
2465 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2466 			   NULL, "enables call-graph recording" ,
2467 			   &record_callchain_opt),
2468 	OPT_CALLBACK(0, "call-graph", &record.opts,
2469 		     "record_mode[,record_size]", record_callchain_help,
2470 		     &record_parse_callchain_opt),
2471 	OPT_INCR('v', "verbose", &verbose,
2472 		    "be more verbose (show counter open errors, etc)"),
2473 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2474 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2475 		    "per thread counts"),
2476 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2477 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2478 		    "Record the sample physical addresses"),
2479 	OPT_BOOLEAN(0, "data-page-size", &record.opts.sample_data_page_size,
2480 		    "Record the sampled data address data page size"),
2481 	OPT_BOOLEAN(0, "code-page-size", &record.opts.sample_code_page_size,
2482 		    "Record the sampled code address (ip) page size"),
2483 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2484 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2485 			&record.opts.sample_time_set,
2486 			"Record the sample timestamps"),
2487 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2488 			"Record the sample period"),
2489 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2490 		    "don't sample"),
2491 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2492 			&record.no_buildid_cache_set,
2493 			"do not update the buildid cache"),
2494 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2495 			&record.no_buildid_set,
2496 			"do not collect buildids in perf.data"),
2497 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2498 		     "monitor event in cgroup name only",
2499 		     parse_cgroups),
2500 	OPT_INTEGER('D', "delay", &record.opts.initial_delay,
2501 		  "ms to wait before starting measurement after program start (-1: start with events disabled)"),
2502 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2503 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2504 		   "user to profile"),
2505 
2506 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2507 		     "branch any", "sample any taken branches",
2508 		     parse_branch_stack),
2509 
2510 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2511 		     "branch filter mask", "branch stack filter modes",
2512 		     parse_branch_stack),
2513 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2514 		    "sample by weight (on special events only)"),
2515 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2516 		    "sample transaction flags (special events only)"),
2517 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2518 		    "use per-thread mmaps"),
2519 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2520 		    "sample selected machine registers on interrupt,"
2521 		    " use '-I?' to list register names", parse_intr_regs),
2522 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2523 		    "sample selected machine registers on interrupt,"
2524 		    " use '--user-regs=?' to list register names", parse_user_regs),
2525 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2526 		    "Record running/enabled time of read (:S) events"),
2527 	OPT_CALLBACK('k', "clockid", &record.opts, "clockid",
2528 		     "clockid to use for events, see clock_gettime()",
2529 		     parse_clockid),
2530 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2531 			  "opts", "AUX area tracing Snapshot Mode", ""),
2532 	OPT_STRING_OPTARG(0, "aux-sample", &record.opts.auxtrace_sample_opts,
2533 			  "opts", "sample AUX area", ""),
2534 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2535 			"per thread proc mmap processing timeout in ms"),
2536 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2537 		    "Record namespaces events"),
2538 	OPT_BOOLEAN(0, "all-cgroups", &record.opts.record_cgroup,
2539 		    "Record cgroup events"),
2540 	OPT_BOOLEAN_SET(0, "switch-events", &record.opts.record_switch_events,
2541 			&record.opts.record_switch_events_set,
2542 			"Record context switch events"),
2543 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2544 			 "Configure all used events to run in kernel space.",
2545 			 PARSE_OPT_EXCLUSIVE),
2546 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2547 			 "Configure all used events to run in user space.",
2548 			 PARSE_OPT_EXCLUSIVE),
2549 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2550 		    "collect kernel callchains"),
2551 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2552 		    "collect user callchains"),
2553 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2554 		   "clang binary to use for compiling BPF scriptlets"),
2555 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2556 		   "options passed to clang when compiling BPF scriptlets"),
2557 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2558 		   "file", "vmlinux pathname"),
2559 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2560 		    "Record build-id of all DSOs regardless of hits"),
2561 	OPT_BOOLEAN(0, "buildid-mmap", &record.buildid_mmap,
2562 		    "Record build-id in map events"),
2563 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2564 		    "append timestamp to output filename"),
2565 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2566 		    "Record timestamp boundary (time of first/last samples)"),
2567 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2568 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2569 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2570 			  "signal"),
2571 	OPT_CALLBACK_SET(0, "switch-output-event", &record.sb_evlist, &record.switch_output_event_set, "switch output event",
2572 			 "switch output event selector. use 'perf list' to list available events",
2573 			 parse_events_option_new_evlist),
2574 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2575 		   "Limit number of switch output generated files"),
2576 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2577 		    "Parse options then exit"),
2578 #ifdef HAVE_AIO_SUPPORT
2579 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2580 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2581 		     record__aio_parse),
2582 #endif
2583 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2584 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2585 		     record__parse_affinity),
2586 #ifdef HAVE_ZSTD_SUPPORT
2587 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2588 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2589 			    record__parse_comp_level),
2590 #endif
2591 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2592 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2593 	OPT_UINTEGER(0, "num-thread-synthesize",
2594 		     &record.opts.nr_threads_synthesize,
2595 		     "number of threads to run for event synthesis"),
2596 #ifdef HAVE_LIBPFM
2597 	OPT_CALLBACK(0, "pfm-events", &record.evlist, "event",
2598 		"libpfm4 event selector. use 'perf list' to list available events",
2599 		parse_libpfm_events_option),
2600 #endif
2601 	OPT_CALLBACK(0, "control", &record.opts, "fd:ctl-fd[,ack-fd] or fifo:ctl-fifo[,ack-fifo]",
2602 		     "Listen on ctl-fd descriptor for command to control measurement ('enable': enable events, 'disable': disable events,\n"
2603 		     "\t\t\t  'snapshot': AUX area tracing snapshot).\n"
2604 		     "\t\t\t  Optionally send control command completion ('ack\\n') to ack-fd descriptor.\n"
2605 		     "\t\t\t  Alternatively, ctl-fifo / ack-fifo will be opened and used as ctl-fd / ack-fd.",
2606 		      parse_control_option),
2607 	OPT_END()
2608 };
2609 
2610 struct option *record_options = __record_options;
2611 
2612 int cmd_record(int argc, const char **argv)
2613 {
2614 	int err;
2615 	struct record *rec = &record;
2616 	char errbuf[BUFSIZ];
2617 
2618 	setlocale(LC_ALL, "");
2619 
2620 #ifndef HAVE_LIBBPF_SUPPORT
2621 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2622 	set_nobuild('\0', "clang-path", true);
2623 	set_nobuild('\0', "clang-opt", true);
2624 # undef set_nobuild
2625 #endif
2626 
2627 #ifndef HAVE_BPF_PROLOGUE
2628 # if !defined (HAVE_DWARF_SUPPORT)
2629 #  define REASON  "NO_DWARF=1"
2630 # elif !defined (HAVE_LIBBPF_SUPPORT)
2631 #  define REASON  "NO_LIBBPF=1"
2632 # else
2633 #  define REASON  "this architecture doesn't support BPF prologue"
2634 # endif
2635 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2636 	set_nobuild('\0', "vmlinux", true);
2637 # undef set_nobuild
2638 # undef REASON
2639 #endif
2640 
2641 	rec->opts.affinity = PERF_AFFINITY_SYS;
2642 
2643 	rec->evlist = evlist__new();
2644 	if (rec->evlist == NULL)
2645 		return -ENOMEM;
2646 
2647 	err = perf_config(perf_record_config, rec);
2648 	if (err)
2649 		return err;
2650 
2651 	argc = parse_options(argc, argv, record_options, record_usage,
2652 			    PARSE_OPT_STOP_AT_NON_OPTION);
2653 	if (quiet)
2654 		perf_quiet_option();
2655 
2656 	/* Make system wide (-a) the default target. */
2657 	if (!argc && target__none(&rec->opts.target))
2658 		rec->opts.target.system_wide = true;
2659 
2660 	if (nr_cgroups && !rec->opts.target.system_wide) {
2661 		usage_with_options_msg(record_usage, record_options,
2662 			"cgroup monitoring only available in system-wide mode");
2663 
2664 	}
2665 
2666 	if (rec->buildid_mmap) {
2667 		if (!perf_can_record_build_id()) {
2668 			pr_err("Failed: no support for recording build ids in mmap events, update your kernel.\n");
2669 			err = -EINVAL;
2670 			goto out_opts;
2671 		}
2672 		pr_debug("Enabling build id in mmap2 events.\n");
2673 		/* Enable mmap build id synthesizing. */
2674 		symbol_conf.buildid_mmap2 = true;
2675 		/* Enable perf_event_attr::build_id bit. */
2676 		rec->opts.build_id = true;
2677 		/* Disable build id cache. */
2678 		rec->no_buildid = true;
2679 	}
2680 
2681 	if (rec->opts.kcore)
2682 		rec->data.is_dir = true;
2683 
2684 	if (rec->opts.comp_level != 0) {
2685 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2686 		rec->no_buildid = true;
2687 	}
2688 
2689 	if (rec->opts.record_switch_events &&
2690 	    !perf_can_record_switch_events()) {
2691 		ui__error("kernel does not support recording context switch events\n");
2692 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2693 		err = -EINVAL;
2694 		goto out_opts;
2695 	}
2696 
2697 	if (switch_output_setup(rec)) {
2698 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2699 		err = -EINVAL;
2700 		goto out_opts;
2701 	}
2702 
2703 	if (rec->switch_output.time) {
2704 		signal(SIGALRM, alarm_sig_handler);
2705 		alarm(rec->switch_output.time);
2706 	}
2707 
2708 	if (rec->switch_output.num_files) {
2709 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2710 						      sizeof(char *));
2711 		if (!rec->switch_output.filenames) {
2712 			err = -EINVAL;
2713 			goto out_opts;
2714 		}
2715 	}
2716 
2717 	/*
2718 	 * Allow aliases to facilitate the lookup of symbols for address
2719 	 * filters. Refer to auxtrace_parse_filters().
2720 	 */
2721 	symbol_conf.allow_aliases = true;
2722 
2723 	symbol__init(NULL);
2724 
2725 	if (rec->opts.affinity != PERF_AFFINITY_SYS) {
2726 		rec->affinity_mask.nbits = cpu__max_cpu();
2727 		rec->affinity_mask.bits = bitmap_alloc(rec->affinity_mask.nbits);
2728 		if (!rec->affinity_mask.bits) {
2729 			pr_err("Failed to allocate thread mask for %zd cpus\n", rec->affinity_mask.nbits);
2730 			err = -ENOMEM;
2731 			goto out_opts;
2732 		}
2733 		pr_debug2("thread mask[%zd]: empty\n", rec->affinity_mask.nbits);
2734 	}
2735 
2736 	err = record__auxtrace_init(rec);
2737 	if (err)
2738 		goto out;
2739 
2740 	if (dry_run)
2741 		goto out;
2742 
2743 	err = bpf__setup_stdout(rec->evlist);
2744 	if (err) {
2745 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2746 		pr_err("ERROR: Failed to set up BPF stdout: %s\n",
2747 			 errbuf);
2748 		goto out;
2749 	}
2750 
2751 	err = -ENOMEM;
2752 
2753 	if (rec->no_buildid_cache || rec->no_buildid) {
2754 		disable_buildid_cache();
2755 	} else if (rec->switch_output.enabled) {
2756 		/*
2757 		 * In 'perf record --switch-output', disable buildid
2758 		 * generation by default to reduce data file switching
2759 		 * overhead. Still generate buildids if they are explicitly
2760 		 * required, using
2761 		 *
2762 		 *  perf record --switch-output --no-no-buildid \
2763 		 *              --no-no-buildid-cache
2764 		 *
2765 		 * The following code is equivalent to:
2766 		 *
2767 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2768 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2769 		 *         disable_buildid_cache();
2770 		 */
2771 		bool disable = true;
2772 
2773 		if (rec->no_buildid_set && !rec->no_buildid)
2774 			disable = false;
2775 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2776 			disable = false;
2777 		if (disable) {
2778 			rec->no_buildid = true;
2779 			rec->no_buildid_cache = true;
2780 			disable_buildid_cache();
2781 		}
2782 	}
2783 
2784 	if (record.opts.overwrite)
2785 		record.opts.tail_synthesize = true;
2786 
2787 	if (rec->evlist->core.nr_entries == 0 &&
2788 	    __evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2789 		pr_err("Not enough memory for event selector list\n");
2790 		goto out;
2791 	}
2792 
2793 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2794 		rec->opts.no_inherit = true;
2795 
2796 	err = target__validate(&rec->opts.target);
2797 	if (err) {
2798 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2799 		ui__warning("%s\n", errbuf);
2800 	}
2801 
2802 	err = target__parse_uid(&rec->opts.target);
2803 	if (err) {
2804 		int saved_errno = errno;
2805 
2806 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2807 		ui__error("%s", errbuf);
2808 
2809 		err = -saved_errno;
2810 		goto out;
2811 	}
2812 
2813 	/* Enable ignoring missing threads when -u/-p option is defined. */
2814 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2815 
2816 	err = -ENOMEM;
2817 	if (evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2818 		usage_with_options(record_usage, record_options);
2819 
2820 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2821 	if (err)
2822 		goto out;
2823 
2824 	/*
2825 	 * We take all buildids when the file contains
2826 	 * AUX area tracing data, because we do not decode the
2827 	 * trace, as that would take too long.
2828 	 */
2829 	if (rec->opts.full_auxtrace)
2830 		rec->buildid_all = true;
2831 
2832 	if (rec->opts.text_poke) {
2833 		err = record__config_text_poke(rec->evlist);
2834 		if (err) {
2835 			pr_err("record__config_text_poke failed, error %d\n", err);
2836 			goto out;
2837 		}
2838 	}
2839 
2840 	if (record_opts__config(&rec->opts)) {
2841 		err = -EINVAL;
2842 		goto out;
2843 	}
2844 
2845 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2846 		rec->opts.nr_cblocks = nr_cblocks_max;
2847 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2848 
2849 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2850 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2851 
2852 	if (rec->opts.comp_level > comp_level_max)
2853 		rec->opts.comp_level = comp_level_max;
2854 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2855 
2856 	err = __cmd_record(&record, argc, argv);
2857 out:
2858 	bitmap_free(rec->affinity_mask.bits);
2859 	evlist__delete(rec->evlist);
2860 	symbol__exit();
2861 	auxtrace_record__free(rec->itr);
2862 out_opts:
2863 	evlist__close_control(rec->opts.ctl_fd, rec->opts.ctl_fd_ack, &rec->opts.ctl_fd_close);
2864 	return err;
2865 }
2866 
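/*
 * Signal handler used for SIGUSR2 (see the --switch-output help): arm the
 * AUX area snapshot trigger and, with --switch-output=signal, the
 * switch-output trigger as well.
 */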
2867 static void snapshot_sig_handler(int sig __maybe_unused)
2868 {
2869 	struct record *rec = &record;
2870 
2871 	hit_auxtrace_snapshot_trigger(rec);
2872 
2873 	if (switch_output_signal(rec))
2874 		trigger_hit(&switch_output_trigger);
2875 }
2876 
2877 static void alarm_sig_handler(int sig __maybe_unused)
2878 {
2879 	struct record *rec = &record;
2880 
2881 	if (switch_output_time(rec))
2882 		trigger_hit(&switch_output_trigger);
2883 }
2884