xref: /linux/tools/perf/builtin-record.c (revision a44e4f3ab16bc808590763a543a93b6fbf3abcc4)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/mmap.h"
24 #include "util/target.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/record.h"
29 #include "util/cpumap.h"
30 #include "util/thread_map.h"
31 #include "util/data.h"
32 #include "util/perf_regs.h"
33 #include "util/auxtrace.h"
34 #include "util/tsc.h"
35 #include "util/parse-branch-options.h"
36 #include "util/parse-regs-options.h"
37 #include "util/llvm-utils.h"
38 #include "util/bpf-loader.h"
39 #include "util/trigger.h"
40 #include "util/perf-hooks.h"
41 #include "util/cpu-set-sched.h"
42 #include "util/synthetic-events.h"
43 #include "util/time-utils.h"
44 #include "util/units.h"
45 #include "util/bpf-event.h"
46 #include "asm/bug.h"
47 #include "perf.h"
48 
49 #include <errno.h>
50 #include <inttypes.h>
51 #include <locale.h>
52 #include <poll.h>
53 #include <unistd.h>
54 #include <sched.h>
55 #include <signal.h>
56 #include <sys/mman.h>
57 #include <sys/wait.h>
58 #include <sys/types.h>
59 #include <sys/stat.h>
60 #include <fcntl.h>
61 #include <linux/err.h>
62 #include <linux/string.h>
63 #include <linux/time64.h>
64 #include <linux/zalloc.h>
65 
66 struct switch_output {
67 	bool		 enabled;
68 	bool		 signal;
69 	unsigned long	 size;
70 	unsigned long	 time;
71 	const char	*str;
72 	bool		 set;
73 	char		 **filenames;
74 	int		 num_files;
75 	int		 cur_file;
76 };
77 
78 struct record {
79 	struct perf_tool	tool;
80 	struct record_opts	opts;
81 	u64			bytes_written;
82 	struct perf_data	data;
83 	struct auxtrace_record	*itr;
84 	struct evlist	*evlist;
85 	struct perf_session	*session;
86 	int			realtime_prio;
87 	bool			no_buildid;
88 	bool			no_buildid_set;
89 	bool			no_buildid_cache;
90 	bool			no_buildid_cache_set;
91 	bool			buildid_all;
92 	bool			timestamp_filename;
93 	bool			timestamp_boundary;
94 	struct switch_output	switch_output;
95 	unsigned long long	samples;
96 	cpu_set_t		affinity_mask;
97 	unsigned long		output_max_size;	/* = 0: unlimited */
98 };
99 
100 static volatile int done;
101 
102 static volatile int auxtrace_record__snapshot_started;
103 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
104 static DEFINE_TRIGGER(switch_output_trigger);
105 
106 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
107 	"SYS", "NODE", "CPU"
108 };
109 
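/*
 * Helpers telling whether output switching ('perf record --switch-output')
 * was requested by signal, by written output size, or by time, based on the
 * switch_output settings and the state of switch_output_trigger.
 */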
110 static bool switch_output_signal(struct record *rec)
111 {
112 	return rec->switch_output.signal &&
113 	       trigger_is_ready(&switch_output_trigger);
114 }
115 
116 static bool switch_output_size(struct record *rec)
117 {
118 	return rec->switch_output.size &&
119 	       trigger_is_ready(&switch_output_trigger) &&
120 	       (rec->bytes_written >= rec->switch_output.size);
121 }
122 
123 static bool switch_output_time(struct record *rec)
124 {
125 	return rec->switch_output.time &&
126 	       trigger_is_ready(&switch_output_trigger);
127 }
128 
129 static bool record__output_max_size_exceeded(struct record *rec)
130 {
131 	return rec->output_max_size &&
132 	       (rec->bytes_written >= rec->output_max_size);
133 }
134 
135 static int record__write(struct record *rec, struct mmap *map __maybe_unused,
136 			 void *bf, size_t size)
137 {
138 	struct perf_data_file *file = &rec->session->data->file;
139 
140 	if (perf_data_file__write(file, bf, size) < 0) {
141 		pr_err("failed to write perf data, error: %m\n");
142 		return -1;
143 	}
144 
145 	rec->bytes_written += size;
146 
147 	if (record__output_max_size_exceeded(rec) && !done) {
148 		fprintf(stderr, "[ perf record: perf size limit reached (%" PRIu64 " KB),"
149 				" stopping session ]\n",
150 				rec->bytes_written >> 10);
151 		done = 1;
152 	}
153 
154 	if (switch_output_size(rec))
155 		trigger_hit(&switch_output_trigger);
156 
157 	return 0;
158 }
159 
160 static int record__aio_enabled(struct record *rec);
161 static int record__comp_enabled(struct record *rec);
162 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
163 			    void *src, size_t src_size);
164 
165 #ifdef HAVE_AIO_SUPPORT
166 static int record__aio_write(struct aiocb *cblock, int trace_fd,
167 		void *buf, size_t size, off_t off)
168 {
169 	int rc;
170 
171 	cblock->aio_fildes = trace_fd;
172 	cblock->aio_buf    = buf;
173 	cblock->aio_nbytes = size;
174 	cblock->aio_offset = off;
175 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
176 
177 	do {
178 		rc = aio_write(cblock);
179 		if (rc == 0) {
180 			break;
181 		} else if (errno != EAGAIN) {
182 			cblock->aio_fildes = -1;
183 			pr_err("failed to queue perf data, error: %m\n");
184 			break;
185 		}
186 	} while (1);
187 
188 	return rc;
189 }
190 
191 static int record__aio_complete(struct mmap *md, struct aiocb *cblock)
192 {
193 	void *rem_buf;
194 	off_t rem_off;
195 	size_t rem_size;
196 	int rc, aio_errno;
197 	ssize_t aio_ret, written;
198 
199 	aio_errno = aio_error(cblock);
200 	if (aio_errno == EINPROGRESS)
201 		return 0;
202 
203 	written = aio_ret = aio_return(cblock);
204 	if (aio_ret < 0) {
205 		if (aio_errno != EINTR)
206 			pr_err("failed to write perf data, error: %m\n");
207 		written = 0;
208 	}
209 
210 	rem_size = cblock->aio_nbytes - written;
211 
212 	if (rem_size == 0) {
213 		cblock->aio_fildes = -1;
214 		/*
215 		 * md->refcount is incremented in record__aio_pushfn() for
216 		 * every aio write request started in record__aio_push() so
217 		 * decrement it because the request is now complete.
218 		 */
219 		perf_mmap__put(&md->core);
220 		rc = 1;
221 	} else {
222 		/*
223 		 * The aio write request may need to be restarted with the
224 		 * remainder if the kernel didn't write the whole
225 		 * chunk at once.
226 		 */
227 		rem_off = cblock->aio_offset + written;
228 		rem_buf = (void *)(cblock->aio_buf + written);
229 		record__aio_write(cblock, cblock->aio_fildes,
230 				rem_buf, rem_size, rem_off);
231 		rc = 0;
232 	}
233 
234 	return rc;
235 }
236 
237 static int record__aio_sync(struct mmap *md, bool sync_all)
238 {
239 	struct aiocb **aiocb = md->aio.aiocb;
240 	struct aiocb *cblocks = md->aio.cblocks;
241 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
242 	int i, do_suspend;
243 
244 	do {
245 		do_suspend = 0;
246 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
247 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
248 				if (sync_all)
249 					aiocb[i] = NULL;
250 				else
251 					return i;
252 			} else {
253 				/*
254 				 * The started aio write is not complete yet,
255 				 * so it has to be waited on before the
256 				 * next allocation.
257 				 */
258 				aiocb[i] = &cblocks[i];
259 				do_suspend = 1;
260 			}
261 		}
262 		if (!do_suspend)
263 			return -1;
264 
265 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
266 			if (!(errno == EAGAIN || errno == EINTR))
267 				pr_err("failed to sync perf data, error: %m\n");
268 		}
269 	} while (1);
270 }
271 
272 struct record_aio {
273 	struct record	*rec;
274 	void		*data;
275 	size_t		size;
276 };
277 
278 static int record__aio_pushfn(struct mmap *map, void *to, void *buf, size_t size)
279 {
280 	struct record_aio *aio = to;
281 
282 	/*
283 	 * The map->core.base data pointed to by buf is copied into a free map->aio.data[]
284 	 * buffer to release space in the kernel buffer as fast as possible, by calling
285 	 * perf_mmap__consume() from the perf_mmap__push() function.
286 	 *
287 	 * That lets the kernel proceed with storing more profiling data into
288 	 * the kernel buffer earlier than the other per-cpu kernel buffers are handled.
289 	 *
290 	 * Copying can be done in two steps in case the chunk of profiling data
291 	 * crosses the upper bound of the kernel buffer. In this case we first move
292 	 * the part of the data from map->start to the upper bound and then the remainder
293 	 * from the beginning of the kernel buffer to the end of the data chunk.
294 	 */
295 
296 	if (record__comp_enabled(aio->rec)) {
297 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
298 				     mmap__mmap_len(map) - aio->size,
299 				     buf, size);
300 	} else {
301 		memcpy(aio->data + aio->size, buf, size);
302 	}
303 
304 	if (!aio->size) {
305 		/*
306 		 * Increment map->refcount to guard the map->aio.data[] buffer
307 		 * from premature deallocation, because the map object can be
308 		 * released before the aio write request started on the
309 		 * map->aio.data[] buffer is complete.
310 		 *
311 		 * perf_mmap__put() is done at record__aio_complete()
312 		 * after the started aio request completes, or at record__aio_push()
313 		 * if the request failed to start.
314 		 */
315 		perf_mmap__get(&map->core);
316 	}
317 
318 	aio->size += size;
319 
320 	return size;
321 }
322 
323 static int record__aio_push(struct record *rec, struct mmap *map, off_t *off)
324 {
325 	int ret, idx;
326 	int trace_fd = rec->session->data->file.fd;
327 	struct record_aio aio = { .rec = rec, .size = 0 };
328 
329 	/*
330 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
331 	 * becomes available after the previous aio write operation.
332 	 */
333 
334 	idx = record__aio_sync(map, false);
335 	aio.data = map->aio.data[idx];
336 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
337 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
338 		return ret;
339 
340 	rec->samples++;
341 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
342 	if (!ret) {
343 		*off += aio.size;
344 		rec->bytes_written += aio.size;
345 		if (switch_output_size(rec))
346 			trigger_hit(&switch_output_trigger);
347 	} else {
348 		/*
349 		 * Decrement the map->refcount taken in record__aio_pushfn()
350 		 * if the record__aio_write() operation failed to start; otherwise
351 		 * map->refcount is decremented in record__aio_complete() after
352 		 * the aio write operation finishes successfully.
353 		 */
354 		perf_mmap__put(&map->core);
355 	}
356 
357 	return ret;
358 }
359 
360 static off_t record__aio_get_pos(int trace_fd)
361 {
362 	return lseek(trace_fd, 0, SEEK_CUR);
363 }
364 
365 static void record__aio_set_pos(int trace_fd, off_t pos)
366 {
367 	lseek(trace_fd, pos, SEEK_SET);
368 }
369 
370 static void record__aio_mmap_read_sync(struct record *rec)
371 {
372 	int i;
373 	struct evlist *evlist = rec->evlist;
374 	struct mmap *maps = evlist->mmap;
375 
376 	if (!record__aio_enabled(rec))
377 		return;
378 
379 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
380 		struct mmap *map = &maps[i];
381 
382 		if (map->core.base)
383 			record__aio_sync(map, true);
384 	}
385 }
386 
387 static int nr_cblocks_default = 1;
388 static int nr_cblocks_max = 4;
389 
390 static int record__aio_parse(const struct option *opt,
391 			     const char *str,
392 			     int unset)
393 {
394 	struct record_opts *opts = (struct record_opts *)opt->value;
395 
396 	if (unset) {
397 		opts->nr_cblocks = 0;
398 	} else {
399 		if (str)
400 			opts->nr_cblocks = strtol(str, NULL, 0);
401 		if (!opts->nr_cblocks)
402 			opts->nr_cblocks = nr_cblocks_default;
403 	}
404 
405 	return 0;
406 }
407 #else /* HAVE_AIO_SUPPORT */
408 static int nr_cblocks_max = 0;
409 
410 static int record__aio_push(struct record *rec __maybe_unused, struct mmap *map __maybe_unused,
411 			    off_t *off __maybe_unused)
412 {
413 	return -1;
414 }
415 
416 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
417 {
418 	return -1;
419 }
420 
421 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
422 {
423 }
424 
425 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
426 {
427 }
428 #endif
429 
430 static int record__aio_enabled(struct record *rec)
431 {
432 	return rec->opts.nr_cblocks > 0;
433 }
434 
435 #define MMAP_FLUSH_DEFAULT 1
436 static int record__mmap_flush_parse(const struct option *opt,
437 				    const char *str,
438 				    int unset)
439 {
440 	int flush_max;
441 	struct record_opts *opts = (struct record_opts *)opt->value;
442 	static struct parse_tag tags[] = {
443 			{ .tag  = 'B', .mult = 1       },
444 			{ .tag  = 'K', .mult = 1 << 10 },
445 			{ .tag  = 'M', .mult = 1 << 20 },
446 			{ .tag  = 'G', .mult = 1 << 30 },
447 			{ .tag  = 0 },
448 	};
449 
450 	if (unset)
451 		return 0;
452 
453 	if (str) {
454 		opts->mmap_flush = parse_tag_value(str, tags);
455 		if (opts->mmap_flush == (int)-1)
456 			opts->mmap_flush = strtol(str, NULL, 0);
457 	}
458 
459 	if (!opts->mmap_flush)
460 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
461 
462 	flush_max = evlist__mmap_size(opts->mmap_pages);
463 	flush_max /= 4;
464 	if (opts->mmap_flush > flush_max)
465 		opts->mmap_flush = flush_max;
466 
467 	return 0;
468 }
469 
470 #ifdef HAVE_ZSTD_SUPPORT
471 static unsigned int comp_level_default = 1;
472 
473 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
474 {
475 	struct record_opts *opts = opt->value;
476 
477 	if (unset) {
478 		opts->comp_level = 0;
479 	} else {
480 		if (str)
481 			opts->comp_level = strtol(str, NULL, 0);
482 		if (!opts->comp_level)
483 			opts->comp_level = comp_level_default;
484 	}
485 
486 	return 0;
487 }
488 #endif
489 static unsigned int comp_level_max = 22;
490 
491 static int record__comp_enabled(struct record *rec)
492 {
493 	return rec->opts.comp_level > 0;
494 }
495 
496 static int process_synthesized_event(struct perf_tool *tool,
497 				     union perf_event *event,
498 				     struct perf_sample *sample __maybe_unused,
499 				     struct machine *machine __maybe_unused)
500 {
501 	struct record *rec = container_of(tool, struct record, tool);
502 	return record__write(rec, NULL, event, event->header.size);
503 }
504 
505 static int record__pushfn(struct mmap *map, void *to, void *bf, size_t size)
506 {
507 	struct record *rec = to;
508 
509 	if (record__comp_enabled(rec)) {
510 		size = zstd_compress(rec->session, map->data, mmap__mmap_len(map), bf, size);
511 		bf   = map->data;
512 	}
513 
514 	rec->samples++;
515 	return record__write(rec, map, bf, size);
516 }
517 
518 static volatile int signr = -1;
519 static volatile int child_finished;
520 
521 static void sig_handler(int sig)
522 {
523 	if (sig == SIGCHLD)
524 		child_finished = 1;
525 	else
526 		signr = sig;
527 
528 	done = 1;
529 }
530 
531 static void sigsegv_handler(int sig)
532 {
533 	perf_hooks__recover();
534 	sighandler_dump_stack(sig);
535 }
536 
537 static void record__sig_exit(void)
538 {
539 	if (signr == -1)
540 		return;
541 
542 	signal(signr, SIG_DFL);
543 	raise(signr);
544 }
545 
546 #ifdef HAVE_AUXTRACE_SUPPORT
547 
548 static int record__process_auxtrace(struct perf_tool *tool,
549 				    struct mmap *map,
550 				    union perf_event *event, void *data1,
551 				    size_t len1, void *data2, size_t len2)
552 {
553 	struct record *rec = container_of(tool, struct record, tool);
554 	struct perf_data *data = &rec->data;
555 	size_t padding;
556 	u8 pad[8] = {0};
557 
558 	if (!perf_data__is_pipe(data) && perf_data__is_single_file(data)) {
559 		off_t file_offset;
560 		int fd = perf_data__fd(data);
561 		int err;
562 
563 		file_offset = lseek(fd, 0, SEEK_CUR);
564 		if (file_offset == -1)
565 			return -1;
566 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
567 						     event, file_offset);
568 		if (err)
569 			return err;
570 	}
571 
572 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
573 	padding = (len1 + len2) & 7;
574 	if (padding)
575 		padding = 8 - padding;
576 
577 	record__write(rec, map, event, event->header.size);
578 	record__write(rec, map, data1, len1);
579 	if (len2)
580 		record__write(rec, map, data2, len2);
581 	record__write(rec, map, &pad, padding);
582 
583 	return 0;
584 }
585 
586 static int record__auxtrace_mmap_read(struct record *rec,
587 				      struct mmap *map)
588 {
589 	int ret;
590 
591 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
592 				  record__process_auxtrace);
593 	if (ret < 0)
594 		return ret;
595 
596 	if (ret)
597 		rec->samples++;
598 
599 	return 0;
600 }
601 
602 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
603 					       struct mmap *map)
604 {
605 	int ret;
606 
607 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
608 					   record__process_auxtrace,
609 					   rec->opts.auxtrace_snapshot_size);
610 	if (ret < 0)
611 		return ret;
612 
613 	if (ret)
614 		rec->samples++;
615 
616 	return 0;
617 }
618 
619 static int record__auxtrace_read_snapshot_all(struct record *rec)
620 {
621 	int i;
622 	int rc = 0;
623 
624 	for (i = 0; i < rec->evlist->core.nr_mmaps; i++) {
625 		struct mmap *map = &rec->evlist->mmap[i];
626 
627 		if (!map->auxtrace_mmap.base)
628 			continue;
629 
630 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
631 			rc = -1;
632 			goto out;
633 		}
634 	}
635 out:
636 	return rc;
637 }
638 
639 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
640 {
641 	pr_debug("Recording AUX area tracing snapshot\n");
642 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
643 		trigger_error(&auxtrace_snapshot_trigger);
644 	} else {
645 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
646 			trigger_error(&auxtrace_snapshot_trigger);
647 		else
648 			trigger_ready(&auxtrace_snapshot_trigger);
649 	}
650 }
651 
652 static int record__auxtrace_snapshot_exit(struct record *rec)
653 {
654 	if (trigger_is_error(&auxtrace_snapshot_trigger))
655 		return 0;
656 
657 	if (!auxtrace_record__snapshot_started &&
658 	    auxtrace_record__snapshot_start(rec->itr))
659 		return -1;
660 
661 	record__read_auxtrace_snapshot(rec, true);
662 	if (trigger_is_error(&auxtrace_snapshot_trigger))
663 		return -1;
664 
665 	return 0;
666 }
667 
668 static int record__auxtrace_init(struct record *rec)
669 {
670 	int err;
671 
672 	if (!rec->itr) {
673 		rec->itr = auxtrace_record__init(rec->evlist, &err);
674 		if (err)
675 			return err;
676 	}
677 
678 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
679 					      rec->opts.auxtrace_snapshot_opts);
680 	if (err)
681 		return err;
682 
683 	return auxtrace_parse_filters(rec->evlist);
684 }
685 
686 #else
687 
688 static inline
689 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
690 			       struct mmap *map __maybe_unused)
691 {
692 	return 0;
693 }
694 
695 static inline
696 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
697 				    bool on_exit __maybe_unused)
698 {
699 }
700 
701 static inline
702 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
703 {
704 	return 0;
705 }
706 
707 static inline
708 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
709 {
710 	return 0;
711 }
712 
713 static int record__auxtrace_init(struct record *rec __maybe_unused)
714 {
715 	return 0;
716 }
717 
718 #endif
719 
720 static bool record__kcore_readable(struct machine *machine)
721 {
722 	char kcore[PATH_MAX];
723 	int fd;
724 
725 	scnprintf(kcore, sizeof(kcore), "%s/proc/kcore", machine->root_dir);
726 
727 	fd = open(kcore, O_RDONLY);
728 	if (fd < 0)
729 		return false;
730 
731 	close(fd);
732 
733 	return true;
734 }
735 
736 static int record__kcore_copy(struct machine *machine, struct perf_data *data)
737 {
738 	char from_dir[PATH_MAX];
739 	char kcore_dir[PATH_MAX];
740 	int ret;
741 
742 	snprintf(from_dir, sizeof(from_dir), "%s/proc", machine->root_dir);
743 
744 	ret = perf_data__make_kcore_dir(data, kcore_dir, sizeof(kcore_dir));
745 	if (ret)
746 		return ret;
747 
748 	return kcore_copy(from_dir, kcore_dir);
749 }
750 
751 static int record__mmap_evlist(struct record *rec,
752 			       struct evlist *evlist)
753 {
754 	struct record_opts *opts = &rec->opts;
755 	char msg[512];
756 
757 	if (opts->affinity != PERF_AFFINITY_SYS)
758 		cpu__setup_cpunode_map();
759 
760 	if (evlist__mmap_ex(evlist, opts->mmap_pages,
761 				 opts->auxtrace_mmap_pages,
762 				 opts->auxtrace_snapshot_mode,
763 				 opts->nr_cblocks, opts->affinity,
764 				 opts->mmap_flush, opts->comp_level) < 0) {
765 		if (errno == EPERM) {
766 			pr_err("Permission error mapping pages.\n"
767 			       "Consider increasing "
768 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
769 			       "or try again with a smaller value of -m/--mmap_pages.\n"
770 			       "(current value: %u,%u)\n",
771 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
772 			return -errno;
773 		} else {
774 			pr_err("failed to mmap with %d (%s)\n", errno,
775 				str_error_r(errno, msg, sizeof(msg)));
776 			if (errno)
777 				return -errno;
778 			else
779 				return -EINVAL;
780 		}
781 	}
782 	return 0;
783 }
784 
785 static int record__mmap(struct record *rec)
786 {
787 	return record__mmap_evlist(rec, rec->evlist);
788 }
789 
790 static int record__open(struct record *rec)
791 {
792 	char msg[BUFSIZ];
793 	struct evsel *pos;
794 	struct evlist *evlist = rec->evlist;
795 	struct perf_session *session = rec->session;
796 	struct record_opts *opts = &rec->opts;
797 	int rc = 0;
798 
799 	/*
800 	 * For initial_delay we need to add a dummy event so that we can track
801 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
802 	 * real events, the ones requested by the user.
803 	 */
804 	if (opts->initial_delay) {
805 		if (perf_evlist__add_dummy(evlist))
806 			return -ENOMEM;
807 
808 		pos = evlist__first(evlist);
809 		pos->tracking = 0;
810 		pos = evlist__last(evlist);
811 		pos->tracking = 1;
812 		pos->core.attr.enable_on_exec = 1;
813 	}
814 
815 	perf_evlist__config(evlist, opts, &callchain_param);
816 
817 	evlist__for_each_entry(evlist, pos) {
818 try_again:
819 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
820 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
821 				if (verbose > 0)
822 					ui__warning("%s\n", msg);
823 				goto try_again;
824 			}
825 			if ((errno == EINVAL || errno == EBADF) &&
826 			    pos->leader != pos &&
827 			    pos->weak_group) {
828 			        pos = perf_evlist__reset_weak_group(evlist, pos);
829 				goto try_again;
830 			}
831 			rc = -errno;
832 			perf_evsel__open_strerror(pos, &opts->target,
833 						  errno, msg, sizeof(msg));
834 			ui__error("%s\n", msg);
835 			goto out;
836 		}
837 
838 		pos->supported = true;
839 	}
840 
841 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(evlist)) {
842 		pr_warning(
843 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
844 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
845 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
846 "file is not found in the buildid cache or in the vmlinux path.\n\n"
847 "Samples in kernel modules won't be resolved at all.\n\n"
848 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
849 "even with a suitable vmlinux or kallsyms file.\n\n");
850 	}
851 
852 	if (perf_evlist__apply_filters(evlist, &pos)) {
853 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
854 			pos->filter, perf_evsel__name(pos), errno,
855 			str_error_r(errno, msg, sizeof(msg)));
856 		rc = -1;
857 		goto out;
858 	}
859 
860 	rc = record__mmap(rec);
861 	if (rc)
862 		goto out;
863 
864 	session->evlist = evlist;
865 	perf_session__set_id_hdr_size(session);
866 out:
867 	return rc;
868 }
869 
870 static int process_sample_event(struct perf_tool *tool,
871 				union perf_event *event,
872 				struct perf_sample *sample,
873 				struct evsel *evsel,
874 				struct machine *machine)
875 {
876 	struct record *rec = container_of(tool, struct record, tool);
877 
878 	if (rec->evlist->first_sample_time == 0)
879 		rec->evlist->first_sample_time = sample->time;
880 
881 	rec->evlist->last_sample_time = sample->time;
882 
883 	if (rec->buildid_all)
884 		return 0;
885 
886 	rec->samples++;
887 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
888 }
889 
890 static int process_buildids(struct record *rec)
891 {
892 	struct perf_session *session = rec->session;
893 
894 	if (perf_data__size(&rec->data) == 0)
895 		return 0;
896 
897 	/*
898 	 * During this process, it'll load the kernel map and replace the
899 	 * dso->long_name with a real pathname it found.  In this case
900 	 * we prefer a vmlinux path like
901 	 *   /lib/modules/3.16.4/build/vmlinux
902 	 *
903 	 * rather than the build-id path (in the debug directory).
904 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
905 	 */
906 	symbol_conf.ignore_vmlinux_buildid = true;
907 
908 	/*
909 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
910 	 * so there is no need to process samples. But if timestamp_boundary is
911 	 * enabled, it still needs to walk all samples to get the timestamps of
912 	 * the first/last samples.
913 	 */
914 	if (rec->buildid_all && !rec->timestamp_boundary)
915 		rec->tool.sample = NULL;
916 
917 	return perf_session__process_events(session);
918 }
919 
920 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
921 {
922 	int err;
923 	struct perf_tool *tool = data;
924 	/*
925 	 * As for the guest kernel, when processing the record & report
926 	 * subcommands we arrange the module mmaps prior to the guest kernel
927 	 * mmap and trigger a DSO preload, because by default guest module
928 	 * symbols are loaded from guest kallsyms instead of
929 	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
930 	 * address is in a module instead of in the guest kernel.
931 	 */
932 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
933 					     machine);
934 	if (err < 0)
935 		pr_err("Couldn't record guest kernel [%d]'s reference"
936 		       " relocation symbol.\n", machine->pid);
937 
938 	/*
939 	 * We use _stext for the guest kernel because the guest kernel's
940 	 * /proc/kallsyms sometimes has no _text.
941 	 */
942 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
943 						 machine);
944 	if (err < 0)
945 		pr_err("Couldn't record guest kernel [%d]'s reference"
946 		       " relocation symbol.\n", machine->pid);
947 }
948 
949 static struct perf_event_header finished_round_event = {
950 	.size = sizeof(struct perf_event_header),
951 	.type = PERF_RECORD_FINISHED_ROUND,
952 };
953 
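/*
 * When an --affinity mode other than the default 'sys' is selected, bind the
 * recording thread to the CPU mask associated with the map being processed,
 * so the buffer is read from a CPU close to where the data was produced.
 */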
954 static void record__adjust_affinity(struct record *rec, struct mmap *map)
955 {
956 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
957 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
958 		CPU_ZERO(&rec->affinity_mask);
959 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
960 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
961 	}
962 }
963 
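/*
 * Callback passed to zstd_compress_stream_to_records(): on the first call
 * (increment == 0) it initializes a PERF_RECORD_COMPRESSED header and returns
 * its size; on subsequent calls it grows header.size by the amount of newly
 * produced compressed payload.
 */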
964 static size_t process_comp_header(void *record, size_t increment)
965 {
966 	struct perf_record_compressed *event = record;
967 	size_t size = sizeof(*event);
968 
969 	if (increment) {
970 		event->header.size += increment;
971 		return increment;
972 	}
973 
974 	event->header.type = PERF_RECORD_COMPRESSED;
975 	event->header.size = size;
976 
977 	return size;
978 }
979 
980 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
981 			    void *src, size_t src_size)
982 {
983 	size_t compressed;
984 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
985 
986 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
987 						     max_record_size, process_comp_header);
988 
989 	session->bytes_transferred += src_size;
990 	session->bytes_compressed  += compressed;
991 
992 	return compressed;
993 }
994 
995 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
996 				    bool overwrite, bool synch)
997 {
998 	u64 bytes_written = rec->bytes_written;
999 	int i;
1000 	int rc = 0;
1001 	struct mmap *maps;
1002 	int trace_fd = rec->data.file.fd;
1003 	off_t off = 0;
1004 
1005 	if (!evlist)
1006 		return 0;
1007 
1008 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
1009 	if (!maps)
1010 		return 0;
1011 
1012 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
1013 		return 0;
1014 
1015 	if (record__aio_enabled(rec))
1016 		off = record__aio_get_pos(trace_fd);
1017 
1018 	for (i = 0; i < evlist->core.nr_mmaps; i++) {
1019 		u64 flush = 0;
1020 		struct mmap *map = &maps[i];
1021 
1022 		if (map->core.base) {
1023 			record__adjust_affinity(rec, map);
1024 			if (synch) {
1025 				flush = map->core.flush;
1026 				map->core.flush = 1;
1027 			}
1028 			if (!record__aio_enabled(rec)) {
1029 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
1030 					if (synch)
1031 						map->core.flush = flush;
1032 					rc = -1;
1033 					goto out;
1034 				}
1035 			} else {
1036 				if (record__aio_push(rec, map, &off) < 0) {
1037 					record__aio_set_pos(trace_fd, off);
1038 					if (synch)
1039 						map->core.flush = flush;
1040 					rc = -1;
1041 					goto out;
1042 				}
1043 			}
1044 			if (synch)
1045 				map->core.flush = flush;
1046 		}
1047 
1048 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
1049 		    record__auxtrace_mmap_read(rec, map) != 0) {
1050 			rc = -1;
1051 			goto out;
1052 		}
1053 	}
1054 
1055 	if (record__aio_enabled(rec))
1056 		record__aio_set_pos(trace_fd, off);
1057 
1058 	/*
1059 	 * Mark the round finished in case we wrote
1060 	 * at least one event.
1061 	 */
1062 	if (bytes_written != rec->bytes_written)
1063 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
1064 
1065 	if (overwrite)
1066 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1067 out:
1068 	return rc;
1069 }
1070 
1071 static int record__mmap_read_all(struct record *rec, bool synch)
1072 {
1073 	int err;
1074 
1075 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1076 	if (err)
1077 		return err;
1078 
1079 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1080 }
1081 
1082 static void record__init_features(struct record *rec)
1083 {
1084 	struct perf_session *session = rec->session;
1085 	int feat;
1086 
1087 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1088 		perf_header__set_feat(&session->header, feat);
1089 
1090 	if (rec->no_buildid)
1091 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1092 
1093 	if (!have_tracepoints(&rec->evlist->core.entries))
1094 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1095 
1096 	if (!rec->opts.branch_stack)
1097 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1098 
1099 	if (!rec->opts.full_auxtrace)
1100 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1101 
1102 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1103 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1104 
1105 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1106 	if (!record__comp_enabled(rec))
1107 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1108 
1109 	perf_header__clear_feat(&session->header, HEADER_STAT);
1110 }
1111 
1112 static void
1113 record__finish_output(struct record *rec)
1114 {
1115 	struct perf_data *data = &rec->data;
1116 	int fd = perf_data__fd(data);
1117 
1118 	if (data->is_pipe)
1119 		return;
1120 
1121 	rec->session->header.data_size += rec->bytes_written;
1122 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1123 
1124 	if (!rec->no_buildid) {
1125 		process_buildids(rec);
1126 
1127 		if (rec->buildid_all)
1128 			dsos__hit_all(rec->session);
1129 	}
1130 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1131 
1132 	return;
1133 }
1134 
1135 static int record__synthesize_workload(struct record *rec, bool tail)
1136 {
1137 	int err;
1138 	struct perf_thread_map *thread_map;
1139 
1140 	if (rec->opts.tail_synthesize != tail)
1141 		return 0;
1142 
1143 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1144 	if (thread_map == NULL)
1145 		return -1;
1146 
1147 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1148 						 process_synthesized_event,
1149 						 &rec->session->machines.host,
1150 						 rec->opts.sample_address);
1151 	perf_thread_map__put(thread_map);
1152 	return err;
1153 }
1154 
1155 static int record__synthesize(struct record *rec, bool tail);
1156 
1157 static int
1158 record__switch_output(struct record *rec, bool at_exit)
1159 {
1160 	struct perf_data *data = &rec->data;
1161 	int fd, err;
1162 	char *new_filename;
1163 
1164 	/* Same Size:      "2015122520103046"*/
1165 	char timestamp[] = "InvalidTimestamp";
1166 
1167 	record__aio_mmap_read_sync(rec);
1168 
1169 	record__synthesize(rec, true);
1170 	if (target__none(&rec->opts.target))
1171 		record__synthesize_workload(rec, true);
1172 
1173 	rec->samples = 0;
1174 	record__finish_output(rec);
1175 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1176 	if (err) {
1177 		pr_err("Failed to get current timestamp\n");
1178 		return -EINVAL;
1179 	}
1180 
1181 	fd = perf_data__switch(data, timestamp,
1182 				    rec->session->header.data_offset,
1183 				    at_exit, &new_filename);
1184 	if (fd >= 0 && !at_exit) {
1185 		rec->bytes_written = 0;
1186 		rec->session->header.data_size = 0;
1187 	}
1188 
1189 	if (!quiet)
1190 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1191 			data->path, timestamp);
1192 
1193 	if (rec->switch_output.num_files) {
1194 		int n = rec->switch_output.cur_file + 1;
1195 
1196 		if (n >= rec->switch_output.num_files)
1197 			n = 0;
1198 		rec->switch_output.cur_file = n;
1199 		if (rec->switch_output.filenames[n]) {
1200 			remove(rec->switch_output.filenames[n]);
1201 			zfree(&rec->switch_output.filenames[n]);
1202 		}
1203 		rec->switch_output.filenames[n] = new_filename;
1204 	} else {
1205 		free(new_filename);
1206 	}
1207 
1208 	/* Output tracking events */
1209 	if (!at_exit) {
1210 		record__synthesize(rec, false);
1211 
1212 		/*
1213 		 * In 'perf record --switch-output' without -a,
1214 		 * record__synthesize() in record__switch_output() won't
1215 		 * generate tracking events because there's no thread_map
1216 		 * in the evlist, so the newly created perf.data wouldn't
1217 		 * contain mmap and comm information.
1218 		 * Create a fake thread_map and call
1219 		 * perf_event__synthesize_thread_map() directly for those events.
1220 		 */
1221 		if (target__none(&rec->opts.target))
1222 			record__synthesize_workload(rec, false);
1223 	}
1224 	return fd;
1225 }
1226 
1227 static volatile int workload_exec_errno;
1228 
1229 /*
1230  * perf_evlist__prepare_workload will send a SIGUSR1
1231  * if the fork fails, since we asked for it by setting its
1232  * want_signal to true.
1233  */
1234 static void workload_exec_failed_signal(int signo __maybe_unused,
1235 					siginfo_t *info,
1236 					void *ucontext __maybe_unused)
1237 {
1238 	workload_exec_errno = info->si_value.sival_int;
1239 	done = 1;
1240 	child_finished = 1;
1241 }
1242 
1243 static void snapshot_sig_handler(int sig);
1244 static void alarm_sig_handler(int sig);
1245 
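/*
 * Pick the user page of any mmapped ring buffer; record__pick_pc() passes it
 * to perf_event__synth_time_conv() for synthesizing time conversion data.
 */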
1246 static const struct perf_event_mmap_page *
1247 perf_evlist__pick_pc(struct evlist *evlist)
1248 {
1249 	if (evlist) {
1250 		if (evlist->mmap && evlist->mmap[0].core.base)
1251 			return evlist->mmap[0].core.base;
1252 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].core.base)
1253 			return evlist->overwrite_mmap[0].core.base;
1254 	}
1255 	return NULL;
1256 }
1257 
1258 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1259 {
1260 	const struct perf_event_mmap_page *pc;
1261 
1262 	pc = perf_evlist__pick_pc(rec->evlist);
1263 	if (pc)
1264 		return pc;
1265 	return NULL;
1266 }
1267 
1268 static int record__synthesize(struct record *rec, bool tail)
1269 {
1270 	struct perf_session *session = rec->session;
1271 	struct machine *machine = &session->machines.host;
1272 	struct perf_data *data = &rec->data;
1273 	struct record_opts *opts = &rec->opts;
1274 	struct perf_tool *tool = &rec->tool;
1275 	int fd = perf_data__fd(data);
1276 	int err = 0;
1277 
1278 	if (rec->opts.tail_synthesize != tail)
1279 		return 0;
1280 
1281 	if (data->is_pipe) {
1282 		/*
1283 		 * We need to synthesize events first, because some
1284 		 * features work on top of them (on the report side).
1285 		 */
1286 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1287 						   process_synthesized_event);
1288 		if (err < 0) {
1289 			pr_err("Couldn't synthesize attrs.\n");
1290 			goto out;
1291 		}
1292 
1293 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1294 						      process_synthesized_event);
1295 		if (err < 0) {
1296 			pr_err("Couldn't synthesize features.\n");
1297 			return err;
1298 		}
1299 
1300 		if (have_tracepoints(&rec->evlist->core.entries)) {
1301 			/*
1302 			 * FIXME err <= 0 here actually means that
1303 			 * there were no tracepoints, so it's not really
1304 			 * an error, just that we don't need to
1305 			 * synthesize anything.  We really have to
1306 			 * return this more properly and also
1307 			 * propagate errors that currently end up calling die().
1308 			 */
1309 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1310 								  process_synthesized_event);
1311 			if (err <= 0) {
1312 				pr_err("Couldn't record tracing data.\n");
1313 				goto out;
1314 			}
1315 			rec->bytes_written += err;
1316 		}
1317 	}
1318 
1319 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1320 					  process_synthesized_event, machine);
1321 	if (err)
1322 		goto out;
1323 
1324 	if (rec->opts.full_auxtrace) {
1325 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1326 					session, process_synthesized_event);
1327 		if (err)
1328 			goto out;
1329 	}
1330 
1331 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1332 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1333 							 machine);
1334 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1335 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1336 				   "Check /proc/kallsyms permission or run as root.\n");
1337 
1338 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1339 						     machine);
1340 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1341 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1342 				   "Check /proc/modules permission or run as root.\n");
1343 	}
1344 
1345 	if (perf_guest) {
1346 		machines__process_guests(&session->machines,
1347 					 perf_event__synthesize_guest_os, tool);
1348 	}
1349 
1350 	err = perf_event__synthesize_extra_attr(&rec->tool,
1351 						rec->evlist,
1352 						process_synthesized_event,
1353 						data->is_pipe);
1354 	if (err)
1355 		goto out;
1356 
1357 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1358 						 process_synthesized_event,
1359 						NULL);
1360 	if (err < 0) {
1361 		pr_err("Couldn't synthesize thread map.\n");
1362 		return err;
1363 	}
1364 
1365 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1366 					     process_synthesized_event, NULL);
1367 	if (err < 0) {
1368 		pr_err("Couldn't synthesize cpu map.\n");
1369 		return err;
1370 	}
1371 
1372 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1373 						machine, opts);
1374 	if (err < 0)
1375 		pr_warning("Couldn't synthesize bpf events.\n");
1376 
1377 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1378 					    process_synthesized_event, opts->sample_address,
1379 					    1);
1380 out:
1381 	return err;
1382 }
1383 
1384 static int __cmd_record(struct record *rec, int argc, const char **argv)
1385 {
1386 	int err;
1387 	int status = 0;
1388 	unsigned long waking = 0;
1389 	const bool forks = argc > 0;
1390 	struct perf_tool *tool = &rec->tool;
1391 	struct record_opts *opts = &rec->opts;
1392 	struct perf_data *data = &rec->data;
1393 	struct perf_session *session;
1394 	bool disabled = false, draining = false;
1395 	struct evlist *sb_evlist = NULL;
1396 	int fd;
1397 	float ratio = 0;
1398 
1399 	atexit(record__sig_exit);
1400 	signal(SIGCHLD, sig_handler);
1401 	signal(SIGINT, sig_handler);
1402 	signal(SIGTERM, sig_handler);
1403 	signal(SIGSEGV, sigsegv_handler);
1404 
1405 	if (rec->opts.record_namespaces)
1406 		tool->namespace_events = true;
1407 
1408 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1409 		signal(SIGUSR2, snapshot_sig_handler);
1410 		if (rec->opts.auxtrace_snapshot_mode)
1411 			trigger_on(&auxtrace_snapshot_trigger);
1412 		if (rec->switch_output.enabled)
1413 			trigger_on(&switch_output_trigger);
1414 	} else {
1415 		signal(SIGUSR2, SIG_IGN);
1416 	}
1417 
1418 	session = perf_session__new(data, false, tool);
1419 	if (IS_ERR(session)) {
1420 		pr_err("Perf session creation failed.\n");
1421 		return PTR_ERR(session);
1422 	}
1423 
1424 	fd = perf_data__fd(data);
1425 	rec->session = session;
1426 
1427 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1428 		pr_err("Compression initialization failed.\n");
1429 		return -1;
1430 	}
1431 
1432 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1433 	session->header.env.comp_level = rec->opts.comp_level;
1434 
1435 	if (rec->opts.kcore &&
1436 	    !record__kcore_readable(&session->machines.host)) {
1437 		pr_err("ERROR: kcore is not readable.\n");
1438 		return -1;
1439 	}
1440 
1441 	record__init_features(rec);
1442 
1443 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1444 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1445 
1446 	if (forks) {
1447 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1448 						    argv, data->is_pipe,
1449 						    workload_exec_failed_signal);
1450 		if (err < 0) {
1451 			pr_err("Couldn't run the workload!\n");
1452 			status = err;
1453 			goto out_delete_session;
1454 		}
1455 	}
1456 
1457 	/*
1458 	 * If we have just a single event and are sending data
1459 	 * through a pipe, we need to force ID allocation,
1460 	 * because we synthesize the event name through the pipe
1461 	 * and need the ID for that.
1462 	 */
1463 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1464 		rec->opts.sample_id = true;
1465 
1466 	if (record__open(rec) != 0) {
1467 		err = -1;
1468 		goto out_child;
1469 	}
1470 	session->header.env.comp_mmap_len = session->evlist->core.mmap_len;
1471 
1472 	if (rec->opts.kcore) {
1473 		err = record__kcore_copy(&session->machines.host, data);
1474 		if (err) {
1475 			pr_err("ERROR: Failed to copy kcore\n");
1476 			goto out_child;
1477 		}
1478 	}
1479 
1480 	err = bpf__apply_obj_config();
1481 	if (err) {
1482 		char errbuf[BUFSIZ];
1483 
1484 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1485 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1486 			 errbuf);
1487 		goto out_child;
1488 	}
1489 
1490 	/*
1491 	 * Normally perf_session__new would do this, but it doesn't have the
1492 	 * evlist.
1493 	 */
1494 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1495 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1496 		rec->tool.ordered_events = false;
1497 	}
1498 
1499 	if (!rec->evlist->nr_groups)
1500 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1501 
1502 	if (data->is_pipe) {
1503 		err = perf_header__write_pipe(fd);
1504 		if (err < 0)
1505 			goto out_child;
1506 	} else {
1507 		err = perf_session__write_header(session, rec->evlist, fd, false);
1508 		if (err < 0)
1509 			goto out_child;
1510 	}
1511 
1512 	if (!rec->no_buildid
1513 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1514 		pr_err("Couldn't generate buildids. "
1515 		       "Use --no-buildid to profile anyway.\n");
1516 		err = -1;
1517 		goto out_child;
1518 	}
1519 
1520 	if (!opts->no_bpf_event)
1521 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1522 
1523 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1524 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1525 		opts->no_bpf_event = true;
1526 	}
1527 
1528 	err = record__synthesize(rec, false);
1529 	if (err < 0)
1530 		goto out_child;
1531 
1532 	if (rec->realtime_prio) {
1533 		struct sched_param param;
1534 
1535 		param.sched_priority = rec->realtime_prio;
1536 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1537 			pr_err("Could not set realtime priority.\n");
1538 			err = -1;
1539 			goto out_child;
1540 		}
1541 	}
1542 
1543 	/*
1544 	 * When perf is starting the traced process, all the events
1545 	 * (apart from group members) have enable_on_exec=1 set,
1546 	 * so don't spoil it by prematurely enabling them.
1547 	 */
1548 	if (!target__none(&opts->target) && !opts->initial_delay)
1549 		evlist__enable(rec->evlist);
1550 
1551 	/*
1552 	 * Let the child rip
1553 	 */
1554 	if (forks) {
1555 		struct machine *machine = &session->machines.host;
1556 		union perf_event *event;
1557 		pid_t tgid;
1558 
1559 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1560 		if (event == NULL) {
1561 			err = -ENOMEM;
1562 			goto out_child;
1563 		}
1564 
1565 		/*
1566 		 * Some H/W events are generated before the COMM event,
1567 		 * which is emitted during exec(), so perf script
1568 		 * cannot see a correct process name for those events.
1569 		 * Synthesize a COMM event to prevent that.
1570 		 */
1571 		tgid = perf_event__synthesize_comm(tool, event,
1572 						   rec->evlist->workload.pid,
1573 						   process_synthesized_event,
1574 						   machine);
1575 		free(event);
1576 
1577 		if (tgid == -1)
1578 			goto out_child;
1579 
1580 		event = malloc(sizeof(event->namespaces) +
1581 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1582 			       machine->id_hdr_size);
1583 		if (event == NULL) {
1584 			err = -ENOMEM;
1585 			goto out_child;
1586 		}
1587 
1588 		/*
1589 		 * Synthesize NAMESPACES event for the command specified.
1590 		 */
1591 		perf_event__synthesize_namespaces(tool, event,
1592 						  rec->evlist->workload.pid,
1593 						  tgid, process_synthesized_event,
1594 						  machine);
1595 		free(event);
1596 
1597 		perf_evlist__start_workload(rec->evlist);
1598 	}
1599 
1600 	if (opts->initial_delay) {
1601 		usleep(opts->initial_delay * USEC_PER_MSEC);
1602 		evlist__enable(rec->evlist);
1603 	}
1604 
1605 	trigger_ready(&auxtrace_snapshot_trigger);
1606 	trigger_ready(&switch_output_trigger);
1607 	perf_hooks__invoke_record_start();
1608 	for (;;) {
1609 		unsigned long long hits = rec->samples;
1610 
1611 		/*
1612 		 * rec->evlist->bkw_mmap_state may be
1613 		 * BKW_MMAP_EMPTY here: when done == true and
1614 		 * hits != rec->samples in the previous round.
1615 		 *
1616 		 * perf_evlist__toggle_bkw_mmap() ensures we never
1617 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1618 		 */
1619 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1620 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1621 
1622 		if (record__mmap_read_all(rec, false) < 0) {
1623 			trigger_error(&auxtrace_snapshot_trigger);
1624 			trigger_error(&switch_output_trigger);
1625 			err = -1;
1626 			goto out_child;
1627 		}
1628 
1629 		if (auxtrace_record__snapshot_started) {
1630 			auxtrace_record__snapshot_started = 0;
1631 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1632 				record__read_auxtrace_snapshot(rec, false);
1633 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1634 				pr_err("AUX area tracing snapshot failed\n");
1635 				err = -1;
1636 				goto out_child;
1637 			}
1638 		}
1639 
1640 		if (trigger_is_hit(&switch_output_trigger)) {
1641 			/*
1642 			 * If switch_output_trigger is hit, the data in the
1643 			 * overwritable ring buffer should have been collected,
1644 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1645 			 *
1646 			 * If SIGUSR2 was raised after or during record__mmap_read_all(),
1647 			 * record__mmap_read_all() didn't collect data from the
1648 			 * overwritable ring buffer. Read again.
1649 			 */
1650 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1651 				continue;
1652 			trigger_ready(&switch_output_trigger);
1653 
1654 			/*
1655 			 * Re-enable events in the overwrite ring buffer after
1656 			 * record__mmap_read_all(): we should have collected
1657 			 * data from it.
1658 			 */
1659 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1660 
1661 			if (!quiet)
1662 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1663 					waking);
1664 			waking = 0;
1665 			fd = record__switch_output(rec, false);
1666 			if (fd < 0) {
1667 				pr_err("Failed to switch to new file\n");
1668 				trigger_error(&switch_output_trigger);
1669 				err = fd;
1670 				goto out_child;
1671 			}
1672 
1673 			/* re-arm the alarm */
1674 			if (rec->switch_output.time)
1675 				alarm(rec->switch_output.time);
1676 		}
1677 
1678 		if (hits == rec->samples) {
1679 			if (done || draining)
1680 				break;
1681 			err = evlist__poll(rec->evlist, -1);
1682 			/*
1683 			 * Propagate the error only if there is one. Ignore a positive
1684 			 * number of returned events and interrupt errors.
1685 			 */
1686 			if (err > 0 || (err < 0 && errno == EINTR))
1687 				err = 0;
1688 			waking++;
1689 
1690 			if (evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1691 				draining = true;
1692 		}
1693 
1694 		/*
1695 		 * When perf is starting the traced process, the events
1696 		 * die with the process at the end and we wait for that.
1697 		 * Thus there is no need to disable events in this case.
1698 		 */
1699 		if (done && !disabled && !target__none(&opts->target)) {
1700 			trigger_off(&auxtrace_snapshot_trigger);
1701 			evlist__disable(rec->evlist);
1702 			disabled = true;
1703 		}
1704 	}
1705 
1706 	trigger_off(&auxtrace_snapshot_trigger);
1707 	trigger_off(&switch_output_trigger);
1708 
1709 	if (opts->auxtrace_snapshot_on_exit)
1710 		record__auxtrace_snapshot_exit(rec);
1711 
1712 	if (forks && workload_exec_errno) {
1713 		char msg[STRERR_BUFSIZE];
1714 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1715 		pr_err("Workload failed: %s\n", emsg);
1716 		err = -1;
1717 		goto out_child;
1718 	}
1719 
1720 	if (!quiet)
1721 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1722 
1723 	if (target__none(&rec->opts.target))
1724 		record__synthesize_workload(rec, true);
1725 
1726 out_child:
1727 	record__mmap_read_all(rec, true);
1728 	record__aio_mmap_read_sync(rec);
1729 
1730 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1731 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1732 		session->header.env.comp_ratio = ratio + 0.5;
1733 	}
1734 
1735 	if (forks) {
1736 		int exit_status;
1737 
1738 		if (!child_finished)
1739 			kill(rec->evlist->workload.pid, SIGTERM);
1740 
1741 		wait(&exit_status);
1742 
1743 		if (err < 0)
1744 			status = err;
1745 		else if (WIFEXITED(exit_status))
1746 			status = WEXITSTATUS(exit_status);
1747 		else if (WIFSIGNALED(exit_status))
1748 			signr = WTERMSIG(exit_status);
1749 	} else
1750 		status = err;
1751 
1752 	record__synthesize(rec, true);
1753 	/* this will be recalculated during process_buildids() */
1754 	rec->samples = 0;
1755 
1756 	if (!err) {
1757 		if (!rec->timestamp_filename) {
1758 			record__finish_output(rec);
1759 		} else {
1760 			fd = record__switch_output(rec, true);
1761 			if (fd < 0) {
1762 				status = fd;
1763 				goto out_delete_session;
1764 			}
1765 		}
1766 	}
1767 
1768 	perf_hooks__invoke_record_end();
1769 
1770 	if (!err && !quiet) {
1771 		char samples[128];
1772 		const char *postfix = rec->timestamp_filename ?
1773 					".<timestamp>" : "";
1774 
1775 		if (rec->samples && !rec->opts.full_auxtrace)
1776 			scnprintf(samples, sizeof(samples),
1777 				  " (%" PRIu64 " samples)", rec->samples);
1778 		else
1779 			samples[0] = '\0';
1780 
1781 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1782 			perf_data__size(data) / 1024.0 / 1024.0,
1783 			data->path, postfix, samples);
1784 		if (ratio) {
1785 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1786 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1787 					ratio);
1788 		}
1789 		fprintf(stderr, " ]\n");
1790 	}
1791 
1792 out_delete_session:
1793 	zstd_fini(&session->zstd_data);
1794 	perf_session__delete(session);
1795 
1796 	if (!opts->no_bpf_event)
1797 		perf_evlist__stop_sb_thread(sb_evlist);
1798 	return status;
1799 }
1800 
1801 static void callchain_debug(struct callchain_param *callchain)
1802 {
1803 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1804 
1805 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1806 
1807 	if (callchain->record_mode == CALLCHAIN_DWARF)
1808 		pr_debug("callchain: stack dump size %d\n",
1809 			 callchain->dump_size);
1810 }
1811 
1812 int record_opts__parse_callchain(struct record_opts *record,
1813 				 struct callchain_param *callchain,
1814 				 const char *arg, bool unset)
1815 {
1816 	int ret;
1817 	callchain->enabled = !unset;
1818 
1819 	/* --no-call-graph */
1820 	if (unset) {
1821 		callchain->record_mode = CALLCHAIN_NONE;
1822 		pr_debug("callchain: disabled\n");
1823 		return 0;
1824 	}
1825 
1826 	ret = parse_callchain_record_opt(arg, callchain);
1827 	if (!ret) {
1828 		/* Enable data address sampling for DWARF unwind. */
1829 		if (callchain->record_mode == CALLCHAIN_DWARF)
1830 			record->sample_address = true;
1831 		callchain_debug(callchain);
1832 	}
1833 
1834 	return ret;
1835 }
1836 
1837 int record_parse_callchain_opt(const struct option *opt,
1838 			       const char *arg,
1839 			       int unset)
1840 {
1841 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1842 }
1843 
1844 int record_callchain_opt(const struct option *opt,
1845 			 const char *arg __maybe_unused,
1846 			 int unset __maybe_unused)
1847 {
1848 	struct callchain_param *callchain = opt->value;
1849 
1850 	callchain->enabled = true;
1851 
1852 	if (callchain->record_mode == CALLCHAIN_NONE)
1853 		callchain->record_mode = CALLCHAIN_FP;
1854 
1855 	callchain_debug(callchain);
1856 	return 0;
1857 }
1858 
1859 static int perf_record_config(const char *var, const char *value, void *cb)
1860 {
1861 	struct record *rec = cb;
1862 
1863 	if (!strcmp(var, "record.build-id")) {
1864 		if (!strcmp(value, "cache"))
1865 			rec->no_buildid_cache = false;
1866 		else if (!strcmp(value, "no-cache"))
1867 			rec->no_buildid_cache = true;
1868 		else if (!strcmp(value, "skip"))
1869 			rec->no_buildid = true;
1870 		else
1871 			return -1;
1872 		return 0;
1873 	}
1874 	if (!strcmp(var, "record.call-graph")) {
1875 		var = "call-graph.record-mode";
1876 		return perf_default_config(var, value, cb);
1877 	}
1878 #ifdef HAVE_AIO_SUPPORT
1879 	if (!strcmp(var, "record.aio")) {
1880 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1881 		if (!rec->opts.nr_cblocks)
1882 			rec->opts.nr_cblocks = nr_cblocks_default;
1883 	}
1884 #endif
1885 
1886 	return 0;
1887 }
1888 
1889 struct clockid_map {
1890 	const char *name;
1891 	int clockid;
1892 };
1893 
1894 #define CLOCKID_MAP(n, c)	\
1895 	{ .name = n, .clockid = (c), }
1896 
1897 #define CLOCKID_END	{ .name = NULL, }
1898 
1899 
1900 /*
1901  * Add the missing ones, we need to build on many distros...
1902  */
1903 #ifndef CLOCK_MONOTONIC_RAW
1904 #define CLOCK_MONOTONIC_RAW 4
1905 #endif
1906 #ifndef CLOCK_BOOTTIME
1907 #define CLOCK_BOOTTIME 7
1908 #endif
1909 #ifndef CLOCK_TAI
1910 #define CLOCK_TAI 11
1911 #endif
1912 
1913 static const struct clockid_map clockids[] = {
1914 	/* available for all events, NMI safe */
1915 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1916 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1917 
1918 	/* available for some events */
1919 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1920 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1921 	CLOCKID_MAP("tai", CLOCK_TAI),
1922 
1923 	/* available for the lazy */
1924 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1925 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1926 	CLOCKID_MAP("real", CLOCK_REALTIME),
1927 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1928 
1929 	CLOCKID_END,
1930 };
1931 
1932 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1933 {
1934 	struct timespec res;
1935 
1936 	*res_ns = 0;
1937 	if (!clock_getres(clk_id, &res))
1938 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1939 	else
1940 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1941 
1942 	return 0;
1943 }
1944 
1945 static int parse_clockid(const struct option *opt, const char *str, int unset)
1946 {
1947 	struct record_opts *opts = (struct record_opts *)opt->value;
1948 	const struct clockid_map *cm;
1949 	const char *ostr = str;
1950 
1951 	if (unset) {
1952 		opts->use_clockid = 0;
1953 		return 0;
1954 	}
1955 
1956 	/* no arg passed */
1957 	if (!str)
1958 		return 0;
1959 
1960 	/* no setting it twice */
1961 	if (opts->use_clockid)
1962 		return -1;
1963 
1964 	opts->use_clockid = true;
1965 
1966 	/* if it's a number, we're done */
1967 	if (sscanf(str, "%d", &opts->clockid) == 1)
1968 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1969 
1970 	/* allow a "CLOCK_" prefix to the name */
1971 	if (!strncasecmp(str, "CLOCK_", 6))
1972 		str += 6;
1973 
1974 	for (cm = clockids; cm->name; cm++) {
1975 		if (!strcasecmp(str, cm->name)) {
1976 			opts->clockid = cm->clockid;
1977 			return get_clockid_res(opts->clockid,
1978 					       &opts->clockid_res_ns);
1979 		}
1980 	}
1981 
1982 	opts->use_clockid = false;
1983 	ui__warning("unknown clockid %s, check man page\n", ostr);
1984 	return -1;
1985 }
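
/*
 * Illustrative -k/--clockid arguments accepted by parse_clockid() (the
 * workload name is an example):
 *
 *   perf record -k monotonic_raw ./cmd    # name from the clockids[] table
 *   perf record -k CLOCK_BOOTTIME ./cmd   # a "CLOCK_" prefix is stripped
 *   perf record -k 4 ./cmd                # a raw clockid number also works
 *
 * Each path ends in get_clockid_res(), which records the clock resolution.
 */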
1986 
1987 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1988 {
1989 	struct record_opts *opts = (struct record_opts *)opt->value;
1990 
1991 	if (unset || !str)
1992 		return 0;
1993 
1994 	if (!strcasecmp(str, "node"))
1995 		opts->affinity = PERF_AFFINITY_NODE;
1996 	else if (!strcasecmp(str, "cpu"))
1997 		opts->affinity = PERF_AFFINITY_CPU;
1998 
1999 	return 0;
2000 }
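
/*
 * Illustrative usage: "--affinity=node" or "--affinity=cpu". Any other
 * string, or no argument at all, leaves the affinity mode untouched
 * (PERF_AFFINITY_SYS is set as the default in cmd_record()).
 */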
2001 
2002 static int parse_output_max_size(const struct option *opt,
2003 				 const char *str, int unset)
2004 {
2005 	unsigned long *s = (unsigned long *)opt->value;
2006 	static struct parse_tag tags_size[] = {
2007 		{ .tag  = 'B', .mult = 1       },
2008 		{ .tag  = 'K', .mult = 1 << 10 },
2009 		{ .tag  = 'M', .mult = 1 << 20 },
2010 		{ .tag  = 'G', .mult = 1 << 30 },
2011 		{ .tag  = 0 },
2012 	};
2013 	unsigned long val;
2014 
2015 	if (unset) {
2016 		*s = 0;
2017 		return 0;
2018 	}
2019 
2020 	val = parse_tag_value(str, tags_size);
2021 	if (val != (unsigned long) -1) {
2022 		*s = val;
2023 		return 0;
2024 	}
2025 
2026 	return -1;
2027 }
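
/*
 * Illustrative --max-size arguments for the parser above: a size with a
 * B/K/M/G suffix, e.g. "--max-size=200M" or "--max-size=1G". The parsed
 * value lands in record.output_max_size; 0 (the --no-max-size case) means
 * unlimited.
 */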
2028 
2029 static int record__parse_mmap_pages(const struct option *opt,
2030 				    const char *str,
2031 				    int unset __maybe_unused)
2032 {
2033 	struct record_opts *opts = opt->value;
2034 	char *s, *p;
2035 	unsigned int mmap_pages;
2036 	int ret;
2037 
2038 	if (!str)
2039 		return -EINVAL;
2040 
2041 	s = strdup(str);
2042 	if (!s)
2043 		return -ENOMEM;
2044 
2045 	p = strchr(s, ',');
2046 	if (p)
2047 		*p = '\0';
2048 
2049 	if (*s) {
2050 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
2051 		if (ret)
2052 			goto out_free;
2053 		opts->mmap_pages = mmap_pages;
2054 	}
2055 
2056 	if (!p) {
2057 		ret = 0;
2058 		goto out_free;
2059 	}
2060 
2061 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
2062 	if (ret)
2063 		goto out_free;
2064 
2065 	opts->auxtrace_mmap_pages = mmap_pages;
2066 
2067 out_free:
2068 	free(s);
2069 	return ret;
2070 }
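
/*
 * Illustrative -m/--mmap-pages arguments for the parser above (the values
 * are examples): "-m 512" sets only the data mmap size, "-m 512,128" also
 * sets the AUX area tracing mmap size, and "-m ,128" sets the AUX area
 * size alone, leaving the data mmap size at its default.
 */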
2071 
2072 static void switch_output_size_warn(struct record *rec)
2073 {
2074 	u64 wakeup_size = evlist__mmap_size(rec->opts.mmap_pages);
2075 	struct switch_output *s = &rec->switch_output;
2076 
2077 	wakeup_size /= 2;
2078 
2079 	if (s->size < wakeup_size) {
2080 		char buf[100];
2081 
2082 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
2083 		pr_warning("WARNING: switch-output data size is lower than the "
2084 			   "wakeup kernel buffer size (%s), "
2085 			   "expect bigger perf.data sizes\n", buf);
2086 	}
2087 }
2088 
2089 static int switch_output_setup(struct record *rec)
2090 {
2091 	struct switch_output *s = &rec->switch_output;
2092 	static struct parse_tag tags_size[] = {
2093 		{ .tag  = 'B', .mult = 1       },
2094 		{ .tag  = 'K', .mult = 1 << 10 },
2095 		{ .tag  = 'M', .mult = 1 << 20 },
2096 		{ .tag  = 'G', .mult = 1 << 30 },
2097 		{ .tag  = 0 },
2098 	};
2099 	static struct parse_tag tags_time[] = {
2100 		{ .tag  = 's', .mult = 1        },
2101 		{ .tag  = 'm', .mult = 60       },
2102 		{ .tag  = 'h', .mult = 60*60    },
2103 		{ .tag  = 'd', .mult = 60*60*24 },
2104 		{ .tag  = 0 },
2105 	};
2106 	unsigned long val;
2107 
2108 	if (!s->set)
2109 		return 0;
2110 
2111 	if (!strcmp(s->str, "signal")) {
2112 		s->signal = true;
2113 		pr_debug("switch-output with SIGUSR2 signal\n");
2114 		goto enabled;
2115 	}
2116 
2117 	val = parse_tag_value(s->str, tags_size);
2118 	if (val != (unsigned long) -1) {
2119 		s->size = val;
2120 		pr_debug("switch-output with %s size threshold\n", s->str);
2121 		goto enabled;
2122 	}
2123 
2124 	val = parse_tag_value(s->str, tags_time);
2125 	if (val != (unsigned long) -1) {
2126 		s->time = val;
2127 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2128 			 s->str, s->time);
2129 		goto enabled;
2130 	}
2131 
2132 	return -1;
2133 
2134 enabled:
2135 	rec->timestamp_filename = true;
2136 	s->enabled              = true;
2137 
2138 	if (s->size && !rec->opts.no_buffering)
2139 		switch_output_size_warn(rec);
2140 
2141 	return 0;
2142 }
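
/*
 * Illustrative --switch-output arguments matching the parsing above (the
 * workload name is an example):
 *
 *   perf record --switch-output ./cmd        # defaults to "signal": rotate on SIGUSR2
 *   perf record --switch-output=2G ./cmd     # rotate after ~2G of data is written
 *   perf record --switch-output=10m ./cmd    # rotate every 10 minutes
 *
 * Each form also sets timestamp_filename so every output file gets a
 * unique, timestamped name.
 */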
2143 
2144 static const char * const __record_usage[] = {
2145 	"perf record [<options>] [<command>]",
2146 	"perf record [<options>] -- <command> [<options>]",
2147 	NULL
2148 };
2149 const char * const *record_usage = __record_usage;
2150 
2151 /*
2152  * XXX Ideally this would be local to cmd_record() and passed to a record__new(),
2153  * because we need access to it in record__exit(), which is called
2154  * after cmd_record() exits, but since record_options needs to be accessible to
2155  * builtin-script, leave it here.
2156  *
2157  * At least we don't touch it directly in all the other functions here.
2158  *
2159  * Just say no to tons of global variables, sigh.
2160  */
2161 static struct record record = {
2162 	.opts = {
2163 		.sample_time	     = true,
2164 		.mmap_pages	     = UINT_MAX,
2165 		.user_freq	     = UINT_MAX,
2166 		.user_interval	     = ULLONG_MAX,
2167 		.freq		     = 4000,
2168 		.target		     = {
2169 			.uses_mmap   = true,
2170 			.default_per_cpu = true,
2171 		},
2172 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2173 	},
2174 	.tool = {
2175 		.sample		= process_sample_event,
2176 		.fork		= perf_event__process_fork,
2177 		.exit		= perf_event__process_exit,
2178 		.comm		= perf_event__process_comm,
2179 		.namespaces	= perf_event__process_namespaces,
2180 		.mmap		= perf_event__process_mmap,
2181 		.mmap2		= perf_event__process_mmap2,
2182 		.ordered_events	= true,
2183 	},
2184 };
2185 
2186 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2187 	"\n\t\t\t\tDefault: fp";
2188 
2189 static bool dry_run;
2190 
2191 /*
2192  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2193  * with it and switch to using the library functions in perf_evlist that came
2194  * from builtin-record.c, i.e. use record_opts,
2195  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2196  * using pipes, etc.
2197  */
2198 static struct option __record_options[] = {
2199 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2200 		     "event selector. use 'perf list' to list available events",
2201 		     parse_events_option),
2202 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2203 		     "event filter", parse_filter),
2204 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2205 			   NULL, "don't record events from perf itself",
2206 			   exclude_perf),
2207 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2208 		    "record events on existing process id"),
2209 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2210 		    "record events on existing thread id"),
2211 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2212 		    "collect data with this RT SCHED_FIFO priority"),
2213 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2214 		    "collect data without buffering"),
2215 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2216 		    "collect raw sample records from all opened counters"),
2217 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2218 			    "system-wide collection from all CPUs"),
2219 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2220 		    "list of cpus to monitor"),
2221 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2222 	OPT_STRING('o', "output", &record.data.path, "file",
2223 		    "output file name"),
2224 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2225 			&record.opts.no_inherit_set,
2226 			"child tasks do not inherit counters"),
2227 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2228 		    "synthesize non-sample events at the end of output"),
2229 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2230 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2231 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2232 		    "Fail if the specified frequency can't be used"),
2233 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2234 		     "profile at this frequency",
2235 		      record__parse_freq),
2236 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2237 		     "number of mmap data pages and AUX area tracing mmap pages",
2238 		     record__parse_mmap_pages),
2239 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2240 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2241 		     record__mmap_flush_parse),
2242 	OPT_BOOLEAN(0, "group", &record.opts.group,
2243 		    "put the counters into a counter group"),
2244 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2245 			   NULL, "enables call-graph recording" ,
2246 			   &record_callchain_opt),
2247 	OPT_CALLBACK(0, "call-graph", &record.opts,
2248 		     "record_mode[,record_size]", record_callchain_help,
2249 		     &record_parse_callchain_opt),
2250 	OPT_INCR('v', "verbose", &verbose,
2251 		    "be more verbose (show counter open errors, etc)"),
2252 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2253 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2254 		    "per thread counts"),
2255 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2256 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2257 		    "Record the sample physical addresses"),
2258 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2259 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2260 			&record.opts.sample_time_set,
2261 			"Record the sample timestamps"),
2262 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2263 			"Record the sample period"),
2264 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2265 		    "don't sample"),
2266 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2267 			&record.no_buildid_cache_set,
2268 			"do not update the buildid cache"),
2269 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2270 			&record.no_buildid_set,
2271 			"do not collect buildids in perf.data"),
2272 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2273 		     "monitor event in cgroup name only",
2274 		     parse_cgroups),
2275 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2276 		  "ms to wait before starting measurement after program start"),
2277 	OPT_BOOLEAN(0, "kcore", &record.opts.kcore, "copy /proc/kcore"),
2278 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2279 		   "user to profile"),
2280 
2281 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2282 		     "branch any", "sample any taken branches",
2283 		     parse_branch_stack),
2284 
2285 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2286 		     "branch filter mask", "branch stack filter modes",
2287 		     parse_branch_stack),
2288 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2289 		    "sample by weight (on special events only)"),
2290 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2291 		    "sample transaction flags (special events only)"),
2292 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2293 		    "use per-thread mmaps"),
2294 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2295 		    "sample selected machine registers on interrupt,"
2296 		    " use '-I?' to list register names", parse_intr_regs),
2297 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2298 		    "sample selected machine registers on interrupt,"
2299 		    " use '--user-regs=?' to list register names", parse_user_regs),
2300 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2301 		    "Record running/enabled time of read (:S) events"),
2302 	OPT_CALLBACK('k', "clockid", &record.opts,
2303 	"clockid", "clockid to use for events, see clock_gettime()",
2304 	parse_clockid),
2305 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2306 			  "opts", "AUX area tracing Snapshot Mode", ""),
2307 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2308 			"per thread proc mmap processing timeout in ms"),
2309 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2310 		    "Record namespaces events"),
2311 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2312 		    "Record context switch events"),
2313 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2314 			 "Configure all used events to run in kernel space.",
2315 			 PARSE_OPT_EXCLUSIVE),
2316 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2317 			 "Configure all used events to run in user space.",
2318 			 PARSE_OPT_EXCLUSIVE),
2319 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2320 		    "collect kernel callchains"),
2321 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2322 		    "collect user callchains"),
2323 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2324 		   "clang binary to use for compiling BPF scriptlets"),
2325 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2326 		   "options passed to clang when compiling BPF scriptlets"),
2327 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2328 		   "file", "vmlinux pathname"),
2329 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2330 		    "Record build-id of all DSOs regardless of hits"),
2331 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2332 		    "append timestamp to output filename"),
2333 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2334 		    "Record timestamp boundary (time of first/last samples)"),
2335 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2336 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2337 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2338 			  "signal"),
2339 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2340 		   "Limit number of switch output generated files"),
2341 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2342 		    "Parse options then exit"),
2343 #ifdef HAVE_AIO_SUPPORT
2344 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2345 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2346 		     record__aio_parse),
2347 #endif
2348 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2349 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2350 		     record__parse_affinity),
2351 #ifdef HAVE_ZSTD_SUPPORT
2352 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2353 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2354 			    record__parse_comp_level),
2355 #endif
2356 	OPT_CALLBACK(0, "max-size", &record.output_max_size,
2357 		     "size", "Limit the maximum size of the output file", parse_output_max_size),
2358 	OPT_END()
2359 };
2360 
2361 struct option *record_options = __record_options;
2362 
2363 int cmd_record(int argc, const char **argv)
2364 {
2365 	int err;
2366 	struct record *rec = &record;
2367 	char errbuf[BUFSIZ];
2368 
2369 	setlocale(LC_ALL, "");
2370 
2371 #ifndef HAVE_LIBBPF_SUPPORT
2372 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2373 	set_nobuild('\0', "clang-path", true);
2374 	set_nobuild('\0', "clang-opt", true);
2375 # undef set_nobuild
2376 #endif
2377 
2378 #ifndef HAVE_BPF_PROLOGUE
2379 # if !defined (HAVE_DWARF_SUPPORT)
2380 #  define REASON  "NO_DWARF=1"
2381 # elif !defined (HAVE_LIBBPF_SUPPORT)
2382 #  define REASON  "NO_LIBBPF=1"
2383 # else
2384 #  define REASON  "this architecture doesn't support BPF prologue"
2385 # endif
2386 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2387 	set_nobuild('\0', "vmlinux", true);
2388 # undef set_nobuild
2389 # undef REASON
2390 #endif
2391 
2392 	CPU_ZERO(&rec->affinity_mask);
2393 	rec->opts.affinity = PERF_AFFINITY_SYS;
2394 
2395 	rec->evlist = evlist__new();
2396 	if (rec->evlist == NULL)
2397 		return -ENOMEM;
2398 
2399 	err = perf_config(perf_record_config, rec);
2400 	if (err)
2401 		return err;
2402 
2403 	argc = parse_options(argc, argv, record_options, record_usage,
2404 			    PARSE_OPT_STOP_AT_NON_OPTION);
2405 	if (quiet)
2406 		perf_quiet_option();
2407 
2408 	/* Make system wide (-a) the default target. */
2409 	if (!argc && target__none(&rec->opts.target))
2410 		rec->opts.target.system_wide = true;
2411 
2412 	if (nr_cgroups && !rec->opts.target.system_wide) {
2413 		usage_with_options_msg(record_usage, record_options,
2414 			"cgroup monitoring only available in system-wide mode");
2415 
2416 	}
2417 
2418 	if (rec->opts.kcore)
2419 		rec->data.is_dir = true;
2420 
2421 	if (rec->opts.comp_level != 0) {
2422 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2423 		rec->no_buildid = true;
2424 	}
2425 
2426 	if (rec->opts.record_switch_events &&
2427 	    !perf_can_record_switch_events()) {
2428 		ui__error("kernel does not support recording context switch events\n");
2429 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2430 		return -EINVAL;
2431 	}
2432 
2433 	if (switch_output_setup(rec)) {
2434 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2435 		return -EINVAL;
2436 	}
2437 
2438 	if (rec->switch_output.time) {
2439 		signal(SIGALRM, alarm_sig_handler);
2440 		alarm(rec->switch_output.time);
2441 	}
2442 
2443 	if (rec->switch_output.num_files) {
2444 		rec->switch_output.filenames = calloc(sizeof(char *),
2445 						      rec->switch_output.num_files);
2446 		if (!rec->switch_output.filenames)
2447 			return -EINVAL;
2448 	}
2449 
2450 	/*
2451 	 * Allow aliases to facilitate the lookup of symbols for address
2452 	 * filters. Refer to auxtrace_parse_filters().
2453 	 */
2454 	symbol_conf.allow_aliases = true;
2455 
2456 	symbol__init(NULL);
2457 
2458 	err = record__auxtrace_init(rec);
2459 	if (err)
2460 		goto out;
2461 
2462 	if (dry_run)
2463 		goto out;
2464 
2465 	err = bpf__setup_stdout(rec->evlist);
2466 	if (err) {
2467 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2468 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2469 			 errbuf);
2470 		goto out;
2471 	}
2472 
2473 	err = -ENOMEM;
2474 
2475 	if (rec->no_buildid_cache || rec->no_buildid) {
2476 		disable_buildid_cache();
2477 	} else if (rec->switch_output.enabled) {
2478 		/*
2479 		 * In 'perf record --switch-output', disable buildid
2480 		 * generation by default to reduce data file switching
2481 		 * overhead. Still generate buildids if they are required
2482 		 * explicitly using
2483 		 *
2484 		 *  perf record --switch-output --no-no-buildid \
2485 		 *              --no-no-buildid-cache
2486 		 *
2487 		 * The following code is equivalent to:
2488 		 *
2489 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2490 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2491 		 *         disable_buildid_cache();
2492 		 */
2493 		bool disable = true;
2494 
2495 		if (rec->no_buildid_set && !rec->no_buildid)
2496 			disable = false;
2497 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2498 			disable = false;
2499 		if (disable) {
2500 			rec->no_buildid = true;
2501 			rec->no_buildid_cache = true;
2502 			disable_buildid_cache();
2503 		}
2504 	}
2505 
2506 	if (record.opts.overwrite)
2507 		record.opts.tail_synthesize = true;
2508 
2509 	if (rec->evlist->core.nr_entries == 0 &&
2510 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2511 		pr_err("Not enough memory for event selector list\n");
2512 		goto out;
2513 	}
2514 
2515 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2516 		rec->opts.no_inherit = true;
2517 
2518 	err = target__validate(&rec->opts.target);
2519 	if (err) {
2520 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2521 		ui__warning("%s\n", errbuf);
2522 	}
2523 
2524 	err = target__parse_uid(&rec->opts.target);
2525 	if (err) {
2526 		int saved_errno = errno;
2527 
2528 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2529 		ui__error("%s", errbuf);
2530 
2531 		err = -saved_errno;
2532 		goto out;
2533 	}
2534 
2535 	/* Enable ignoring missing threads when the -u/-p option is given. */
2536 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2537 
2538 	err = -ENOMEM;
2539 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2540 		usage_with_options(record_usage, record_options);
2541 
2542 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2543 	if (err)
2544 		goto out;
2545 
2546 	/*
2547 	 * We take all buildids when the file contains
2548 	 * AUX area tracing data, because we do not decode the
2549 	 * trace (it would take too long).
2550 	 */
2551 	if (rec->opts.full_auxtrace)
2552 		rec->buildid_all = true;
2553 
2554 	if (record_opts__config(&rec->opts)) {
2555 		err = -EINVAL;
2556 		goto out;
2557 	}
2558 
2559 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2560 		rec->opts.nr_cblocks = nr_cblocks_max;
2561 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2562 
2563 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2564 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2565 
2566 	if (rec->opts.comp_level > comp_level_max)
2567 		rec->opts.comp_level = comp_level_max;
2568 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2569 
2570 	err = __cmd_record(&record, argc, argv);
2571 out:
2572 	evlist__delete(rec->evlist);
2573 	symbol__exit();
2574 	auxtrace_record__free(rec->itr);
2575 	return err;
2576 }
2577 
2578 static void snapshot_sig_handler(int sig __maybe_unused)
2579 {
2580 	struct record *rec = &record;
2581 
2582 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2583 		trigger_hit(&auxtrace_snapshot_trigger);
2584 		auxtrace_record__snapshot_started = 1;
2585 		if (auxtrace_record__snapshot_start(record.itr))
2586 			trigger_error(&auxtrace_snapshot_trigger);
2587 	}
2588 
2589 	if (switch_output_signal(rec))
2590 		trigger_hit(&switch_output_trigger);
2591 }
2592 
2593 static void alarm_sig_handler(int sig __maybe_unused)
2594 {
2595 	struct record *rec = &record;
2596 
2597 	if (switch_output_time(rec))
2598 		trigger_hit(&switch_output_trigger);
2599 }
2600