xref: /linux/tools/perf/builtin-record.c (revision c1a604dff486399ae0be95e6396e0158df95ad5d)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "util/build-id.h"
12 #include <subcmd/parse-options.h>
13 #include "util/parse-events.h"
14 #include "util/config.h"
15 
16 #include "util/callchain.h"
17 #include "util/cgroup.h"
18 #include "util/header.h"
19 #include "util/event.h"
20 #include "util/evlist.h"
21 #include "util/evsel.h"
22 #include "util/debug.h"
23 #include "util/target.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/record.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/time-utils.h"
42 #include "util/units.h"
43 #include "util/bpf-event.h"
44 #include "asm/bug.h"
45 #include "perf.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <locale.h>
50 #include <poll.h>
51 #include <unistd.h>
52 #include <sched.h>
53 #include <signal.h>
54 #include <sys/mman.h>
55 #include <sys/wait.h>
56 #include <linux/time64.h>
57 #include <linux/zalloc.h>
58 
/*
 * State for the --switch-output option: when the output file is rotated
 * and the ring of filenames used for rotation.
 */
struct switch_output {
	bool		 enabled;	/* some switch-output mode is active */
	bool		 signal;	/* rotate when the switch-output trigger fires on a signal */
	unsigned long	 size;		/* rotate once bytes_written reaches this (0 = off) */
	unsigned long	 time;		/* rotate periodically; units set at option parsing — TODO confirm seconds */
	const char	*str;		/* raw option argument as given on the command line */
	bool		 set;		/* option explicitly provided by the user */
	char		 **filenames;	/* ring of rotated output file names (see record__switch_output()) */
	int		 num_files;	/* capacity of the filenames[] ring (0 = keep every file) */
	int		 cur_file;	/* index of the most recently filled ring slot */
};
70 
/* Aggregate state for one 'perf record' invocation. */
struct record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct record_opts	opts;		/* parsed command-line options */
	u64			bytes_written;	/* payload bytes written to the output so far */
	struct perf_data	data;		/* output file/dir (perf.data) */
	struct auxtrace_record	*itr;		/* AUX area tracing state, NULL if unused */
	struct evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;	/* session backing the output file */
	int			realtime_prio;	/* SCHED_FIFO priority if > 0 — TODO confirm, set outside this chunk */
	bool			no_buildid;	/* skip build-id post-processing */
	bool			no_buildid_set;	/* user explicitly chose the above */
	bool			no_buildid_cache;	/* don't add build-ids to the cache */
	bool			no_buildid_cache_set;	/* user explicitly chose the above */
	bool			buildid_all;	/* mark all DSOs, not just the ones hit by samples */
	bool			timestamp_filename;	/* append a timestamp to the output name */
	bool			timestamp_boundary;	/* record first/last sample times (see process_sample_event()) */
	struct switch_output	switch_output;	/* output-rotation state */
	unsigned long long	samples;	/* chunks/samples pushed from the ring buffers */
	cpu_set_t		affinity_mask;	/* last mask passed to sched_setaffinity() (see record__adjust_affinity()) */
};
91 
/* Set when an AUX snapshot has been requested; cleared/checked by the main loop — setter is outside this chunk. */
static volatile int auxtrace_record__snapshot_started;
/* Coordinates AUX area snapshot requests between signal path and reader. */
static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
/* Coordinates output-file rotation (--switch-output). */
static DEFINE_TRIGGER(switch_output_trigger);

/* Printable names for enum perf_affinity values, indexed by mode. */
static const char *affinity_tags[PERF_AFFINITY_MAX] = {
	"SYS", "NODE", "CPU"
};
99 
100 static bool switch_output_signal(struct record *rec)
101 {
102 	return rec->switch_output.signal &&
103 	       trigger_is_ready(&switch_output_trigger);
104 }
105 
106 static bool switch_output_size(struct record *rec)
107 {
108 	return rec->switch_output.size &&
109 	       trigger_is_ready(&switch_output_trigger) &&
110 	       (rec->bytes_written >= rec->switch_output.size);
111 }
112 
113 static bool switch_output_time(struct record *rec)
114 {
115 	return rec->switch_output.time &&
116 	       trigger_is_ready(&switch_output_trigger);
117 }
118 
119 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
120 			 void *bf, size_t size)
121 {
122 	struct perf_data_file *file = &rec->session->data->file;
123 
124 	if (perf_data_file__write(file, bf, size) < 0) {
125 		pr_err("failed to write perf data, error: %m\n");
126 		return -1;
127 	}
128 
129 	rec->bytes_written += size;
130 
131 	if (switch_output_size(rec))
132 		trigger_hit(&switch_output_trigger);
133 
134 	return 0;
135 }
136 
137 static int record__aio_enabled(struct record *rec);
138 static int record__comp_enabled(struct record *rec);
139 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
140 			    void *src, size_t src_size);
141 
142 #ifdef HAVE_AIO_SUPPORT
/*
 * Queue one POSIX AIO write of @size bytes at file offset @off.
 *
 * Retries aio_write() while it fails with EAGAIN (AIO queue full); any
 * other failure releases the control block (aio_fildes = -1) and gives
 * up.  Returns 0 when the request was queued, -1 on error.
 */
static int record__aio_write(struct aiocb *cblock, int trace_fd,
		void *buf, size_t size, off_t off)
{
	int rc;

	cblock->aio_fildes = trace_fd;
	cblock->aio_buf    = buf;
	cblock->aio_nbytes = size;
	cblock->aio_offset = off;
	/* Completion is detected by polling aio_error(), not via signal. */
	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;

	do {
		rc = aio_write(cblock);
		if (rc == 0) {
			break;
		} else if (errno != EAGAIN) {
			cblock->aio_fildes = -1;
			pr_err("failed to queue perf data, error: %m\n");
			break;
		}
	} while (1);

	return rc;
}
167 
/*
 * Poll one in-flight AIO write for completion.
 *
 * Returns 1 when the request fully completed (control block freed,
 * aio_fildes reset to -1), 0 when it is still in progress or was
 * restarted to write the remainder after a short write.
 */
static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
{
	void *rem_buf;
	off_t rem_off;
	size_t rem_size;
	int rc, aio_errno;
	ssize_t aio_ret, written;

	aio_errno = aio_error(cblock);
	if (aio_errno == EINPROGRESS)
		return 0;

	written = aio_ret = aio_return(cblock);
	if (aio_ret < 0) {
		if (aio_errno != EINTR)
			pr_err("failed to write perf data, error: %m\n");
		/* Treat the failed attempt as a zero-byte write and retry below. */
		written = 0;
	}

	rem_size = cblock->aio_nbytes - written;

	if (rem_size == 0) {
		cblock->aio_fildes = -1;
		/*
		 * md->refcount is incremented in record__aio_pushfn() for
		 * every aio write request started in record__aio_push() so
		 * decrement it because the request is now complete.
		 */
		perf_mmap__put(md);
		rc = 1;
	} else {
		/*
		 * aio write request may require restart with the
		 * remainder if the kernel didn't write the whole
		 * chunk at once.
		 */
		rem_off = cblock->aio_offset + written;
		rem_buf = (void *)(cblock->aio_buf + written);
		record__aio_write(cblock, cblock->aio_fildes,
				rem_buf, rem_size, rem_off);
		rc = 0;
	}

	return rc;
}
213 
/*
 * Wait on the in-flight AIO writes of @md.
 *
 * @sync_all == false: return the index of the first free (or just
 * completed) control block, sleeping in aio_suspend() until one frees
 * up.  @sync_all == true: keep polling until every request completed,
 * then return -1.
 */
static int record__aio_sync(struct perf_mmap *md, bool sync_all)
{
	struct aiocb **aiocb = md->aio.aiocb;
	struct aiocb *cblocks = md->aio.cblocks;
	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
	int i, do_suspend;

	do {
		do_suspend = 0;
		for (i = 0; i < md->aio.nr_cblocks; ++i) {
			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
				if (sync_all)
					aiocb[i] = NULL;
				else
					return i;
			} else {
				/*
				 * Started aio write is not complete yet
				 * so it has to be waited before the
				 * next allocation.
				 */
				aiocb[i] = &cblocks[i];
				do_suspend = 1;
			}
		}
		if (!do_suspend)
			return -1;

		/* Sleep (up to 1ms per round) until some request finishes. */
		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
			if (!(errno == EAGAIN || errno == EINTR))
				pr_err("failed to sync perf data, error: %m\n");
		}
	} while (1);
}
248 
/* Cursor used while staging ring-buffer data into an AIO buffer. */
struct record_aio {
	struct record	*rec;	/* owning record session */
	void		*data;	/* destination map->aio.data[] staging buffer */
	size_t		size;	/* bytes staged into data so far */
};
254 
/*
 * perf_mmap__push() callback: stage (and optionally compress) one chunk
 * of kernel ring-buffer data into aio->data at the current offset.
 * The first chunk takes a reference on @map so the staging buffer
 * outlives the asynchronous write.  Returns the number of bytes staged.
 */
static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
{
	struct record_aio *aio = to;

	/*
	 * map->base data pointed by buf is copied into free map->aio.data[] buffer
	 * to release space in the kernel buffer as fast as possible, calling
	 * perf_mmap__consume() from perf_mmap__push() function.
	 *
	 * That lets the kernel to proceed with storing more profiling data into
	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
	 *
	 * Copying can be done in two steps in case the chunk of profiling data
	 * crosses the upper bound of the kernel buffer. In this case we first move
	 * part of data from map->start till the upper bound and then the remainder
	 * from the beginning of the kernel buffer till the end of the data chunk.
	 */

	if (record__comp_enabled(aio->rec)) {
		size = zstd_compress(aio->rec->session, aio->data + aio->size,
				     perf_mmap__mmap_len(map) - aio->size,
				     buf, size);
	} else {
		memcpy(aio->data + aio->size, buf, size);
	}

	if (!aio->size) {
		/*
		 * Increment map->refcount to guard map->aio.data[] buffer
		 * from premature deallocation because map object can be
		 * released earlier than aio write request started on
		 * map->aio.data[] buffer is complete.
		 *
		 * perf_mmap__put() is done at record__aio_complete()
		 * after started aio request completion or at record__aio_push()
		 * if the request failed to start.
		 */
		perf_mmap__get(map);
	}

	aio->size += size;

	return size;
}
299 
/*
 * Drain @map's kernel buffer into a free AIO staging buffer and queue
 * an asynchronous write of it at file offset *@off; on success *@off is
 * advanced past the queued data.  Returns 0 on success, > 0 when there
 * was no data to push, < 0 on error.
 */
static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
{
	int ret, idx;
	int trace_fd = rec->session->data->file.fd;
	struct record_aio aio = { .rec = rec, .size = 0 };

	/*
	 * Call record__aio_sync() to wait till map->aio.data[] buffer
	 * becomes available after previous aio write operation.
	 */

	idx = record__aio_sync(map, false);
	aio.data = map->aio.data[idx];
	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
		return ret;

	rec->samples++;
	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
	if (!ret) {
		*off += aio.size;
		rec->bytes_written += aio.size;
		if (switch_output_size(rec))
			trigger_hit(&switch_output_trigger);
	} else {
		/*
		 * Decrement map->refcount incremented in record__aio_pushfn()
		 * back if record__aio_write() operation failed to start, otherwise
		 * map->refcount is decremented in record__aio_complete() after
		 * aio write operation finishes successfully.
		 */
		perf_mmap__put(map);
	}

	return ret;
}
336 
337 static off_t record__aio_get_pos(int trace_fd)
338 {
339 	return lseek(trace_fd, 0, SEEK_CUR);
340 }
341 
342 static void record__aio_set_pos(int trace_fd, off_t pos)
343 {
344 	lseek(trace_fd, pos, SEEK_SET);
345 }
346 
347 static void record__aio_mmap_read_sync(struct record *rec)
348 {
349 	int i;
350 	struct evlist *evlist = rec->evlist;
351 	struct perf_mmap *maps = evlist->mmap;
352 
353 	if (!record__aio_enabled(rec))
354 		return;
355 
356 	for (i = 0; i < evlist->nr_mmaps; i++) {
357 		struct perf_mmap *map = &maps[i];
358 
359 		if (map->base)
360 			record__aio_sync(map, true);
361 	}
362 }
363 
/* In-flight AIO requests per mmap when --aio is given without a value. */
static int nr_cblocks_default = 1;
/* Upper bound for --aio; enforced at option handling — TODO confirm, outside this chunk. */
static int nr_cblocks_max = 4;
366 
367 static int record__aio_parse(const struct option *opt,
368 			     const char *str,
369 			     int unset)
370 {
371 	struct record_opts *opts = (struct record_opts *)opt->value;
372 
373 	if (unset) {
374 		opts->nr_cblocks = 0;
375 	} else {
376 		if (str)
377 			opts->nr_cblocks = strtol(str, NULL, 0);
378 		if (!opts->nr_cblocks)
379 			opts->nr_cblocks = nr_cblocks_default;
380 	}
381 
382 	return 0;
383 }
#else /* HAVE_AIO_SUPPORT */
static int nr_cblocks_max = 0;

/* Stub: AIO disabled at build time, so pushing always reports failure. */
static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
			    off_t *off __maybe_unused)
{
	return -1;
}

/* Stub: no AIO, no file position to track. */
static off_t record__aio_get_pos(int trace_fd __maybe_unused)
{
	return -1;
}

/* Stub: nothing to reposition without AIO. */
static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
{
}

/* Stub: no queued writes to wait for without AIO. */
static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
{
}
#endif
406 
407 static int record__aio_enabled(struct record *rec)
408 {
409 	return rec->opts.nr_cblocks > 0;
410 }
411 
/* Minimum number of bytes accumulated before a ring buffer is flushed. */
#define MMAP_FLUSH_DEFAULT 1

/*
 * Parse --mmap-flush: a byte count with optional B/K/M/G suffix, or a
 * plain number.  Zero/unparsable input falls back to
 * MMAP_FLUSH_DEFAULT, and the result is capped at a quarter of the
 * mmap buffer size.
 */
static int record__mmap_flush_parse(const struct option *opt,
				    const char *str,
				    int unset)
{
	int flush_max;
	struct record_opts *opts = (struct record_opts *)opt->value;
	static struct parse_tag tags[] = {
			{ .tag  = 'B', .mult = 1       },
			{ .tag  = 'K', .mult = 1 << 10 },
			{ .tag  = 'M', .mult = 1 << 20 },
			{ .tag  = 'G', .mult = 1 << 30 },
			{ .tag  = 0 },
	};

	if (unset)
		return 0;

	if (str) {
		opts->mmap_flush = parse_tag_value(str, tags);
		/* parse_tag_value() reports failure as -1; retry as a bare number. */
		if (opts->mmap_flush == (int)-1)
			opts->mmap_flush = strtol(str, NULL, 0);
	}

	if (!opts->mmap_flush)
		opts->mmap_flush = MMAP_FLUSH_DEFAULT;

	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
	flush_max /= 4;
	if (opts->mmap_flush > flush_max)
		opts->mmap_flush = flush_max;

	return 0;
}
446 
#ifdef HAVE_ZSTD_SUPPORT
/* Zstd level used when -z/--compression-level is given without a value. */
static unsigned int comp_level_default = 1;

/*
 * Parse -z/--compression-level: --no-compression (unset) disables
 * compression; a missing or zero value selects comp_level_default.
 */
static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
{
	struct record_opts *opts = opt->value;

	if (unset) {
		opts->comp_level = 0;
	} else {
		if (str)
			opts->comp_level = strtol(str, NULL, 0);
		if (!opts->comp_level)
			opts->comp_level = comp_level_default;
	}

	return 0;
}
#endif
/* Highest accepted compression level (zstd's maximum is 22). */
static unsigned int comp_level_max = 22;
467 
468 static int record__comp_enabled(struct record *rec)
469 {
470 	return rec->opts.comp_level > 0;
471 }
472 
/*
 * Tool callback for synthesized (non-kernel) events: write the raw
 * event straight to the output file.  Returns 0 on success, -1 on
 * write error (see record__write()).
 */
static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct record *rec = container_of(tool, struct record, tool);
	return record__write(rec, NULL, event, event->header.size);
}
481 
482 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
483 {
484 	struct record *rec = to;
485 
486 	if (record__comp_enabled(rec)) {
487 		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
488 		bf   = map->data;
489 	}
490 
491 	rec->samples++;
492 	return record__write(rec, map, bf, size);
493 }
494 
/* Main-loop exit flag, set from the signal handlers below. */
static volatile int done;
/* Fatal signal to re-raise at exit (-1 = none, see record__sig_exit()). */
static volatile int signr = -1;
/* Set on SIGCHLD: the forked workload has exited. */
static volatile int child_finished;
498 
499 static void sig_handler(int sig)
500 {
501 	if (sig == SIGCHLD)
502 		child_finished = 1;
503 	else
504 		signr = sig;
505 
506 	done = 1;
507 }
508 
/*
 * SIGSEGV handler: let registered perf hooks restore state first, then
 * dump the stack for the crash report.
 */
static void sigsegv_handler(int sig)
{
	perf_hooks__recover();
	sighandler_dump_stack(sig);
}
514 
515 static void record__sig_exit(void)
516 {
517 	if (signr == -1)
518 		return;
519 
520 	signal(signr, SIG_DFL);
521 	raise(signr);
522 }
523 
524 #ifdef HAVE_AUXTRACE_SUPPORT
525 
/*
 * Write one AUX area trace event: the event header followed by the
 * trace data — possibly in two chunks when it wraps around the AUX ring
 * buffer — plus up to 7 bytes of zero padding to preserve 8-byte
 * alignment.  For plain single-file output the current file offset is
 * also recorded in the session's auxtrace index.
 */
static int record__process_auxtrace(struct perf_tool *tool,
				    struct perf_mmap *map,
				    union perf_event *event, void *data1,
				    size_t len1, void *data2, size_t len2)
{
	struct record *rec = container_of(tool, struct record, tool);
	struct perf_data *data = &rec->data;
	size_t padding;
	u8 pad[8] = {0};

	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
		off_t file_offset;
		int fd = perf_data__fd(data);
		int err;

		file_offset = lseek(fd, 0, SEEK_CUR);
		if (file_offset == -1)
			return -1;
		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
						     event, file_offset);
		if (err)
			return err;
	}

	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
	padding = (len1 + len2) & 7;
	if (padding)
		padding = 8 - padding;

	record__write(rec, map, event, event->header.size);
	record__write(rec, map, data1, len1);
	if (len2)
		record__write(rec, map, data2, len2);
	record__write(rec, map, &pad, padding);

	return 0;
}
563 
564 static int record__auxtrace_mmap_read(struct record *rec,
565 				      struct perf_mmap *map)
566 {
567 	int ret;
568 
569 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
570 				  record__process_auxtrace);
571 	if (ret < 0)
572 		return ret;
573 
574 	if (ret)
575 		rec->samples++;
576 
577 	return 0;
578 }
579 
580 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
581 					       struct perf_mmap *map)
582 {
583 	int ret;
584 
585 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
586 					   record__process_auxtrace,
587 					   rec->opts.auxtrace_snapshot_size);
588 	if (ret < 0)
589 		return ret;
590 
591 	if (ret)
592 		rec->samples++;
593 
594 	return 0;
595 }
596 
597 static int record__auxtrace_read_snapshot_all(struct record *rec)
598 {
599 	int i;
600 	int rc = 0;
601 
602 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
603 		struct perf_mmap *map = &rec->evlist->mmap[i];
604 
605 		if (!map->auxtrace_mmap.base)
606 			continue;
607 
608 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
609 			rc = -1;
610 			goto out;
611 		}
612 	}
613 out:
614 	return rc;
615 }
616 
617 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
618 {
619 	pr_debug("Recording AUX area tracing snapshot\n");
620 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
621 		trigger_error(&auxtrace_snapshot_trigger);
622 	} else {
623 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
624 			trigger_error(&auxtrace_snapshot_trigger);
625 		else
626 			trigger_ready(&auxtrace_snapshot_trigger);
627 	}
628 }
629 
630 static int record__auxtrace_snapshot_exit(struct record *rec)
631 {
632 	if (trigger_is_error(&auxtrace_snapshot_trigger))
633 		return 0;
634 
635 	if (!auxtrace_record__snapshot_started &&
636 	    auxtrace_record__snapshot_start(rec->itr))
637 		return -1;
638 
639 	record__read_auxtrace_snapshot(rec, true);
640 	if (trigger_is_error(&auxtrace_snapshot_trigger))
641 		return -1;
642 
643 	return 0;
644 }
645 
/*
 * One-time AUX area tracing setup: allocate the auxtrace record state
 * (if not already present), apply the snapshot options and parse any
 * auxtrace filters.  Returns 0 or an error from the helpers.
 */
static int record__auxtrace_init(struct record *rec)
{
	int err;

	if (!rec->itr) {
		rec->itr = auxtrace_record__init(rec->evlist, &err);
		if (err)
			return err;
	}

	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
					      rec->opts.auxtrace_snapshot_opts);
	if (err)
		return err;

	return auxtrace_parse_filters(rec->evlist);
}
663 
#else

/* Stub: AUX area tracing compiled out, nothing to read. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct perf_mmap *map __maybe_unused)
{
	return 0;
}

/* Stub: no AUX tracing, no snapshot to take. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
				    bool on_exit __maybe_unused)
{
}

/* Stub: snapshot start always "succeeds" when tracing is compiled out. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}

/* Stub: nothing to flush at exit without AUX tracing. */
static inline
int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
{
	return 0;
}

/* Stub: no auxtrace state to initialize. */
static int record__auxtrace_init(struct record *rec __maybe_unused)
{
	return 0;
}

#endif
697 
/*
 * mmap the per-cpu ring buffers (and AUX area buffers) for @evlist with
 * the sizes and flags from rec->opts.  For NUMA-aware affinity modes
 * the cpu/node map is prepared first.  Returns 0 on success or a
 * negative errno-style error; EPERM gets a dedicated hint about
 * perf_event_mlock_kb.
 */
static int record__mmap_evlist(struct record *rec,
			       struct evlist *evlist)
{
	struct record_opts *opts = &rec->opts;
	char msg[512];

	if (opts->affinity != PERF_AFFINITY_SYS)
		cpu__setup_cpunode_map();

	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
				 opts->auxtrace_mmap_pages,
				 opts->auxtrace_snapshot_mode,
				 opts->nr_cblocks, opts->affinity,
				 opts->mmap_flush, opts->comp_level) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %u,%u)\n",
			       opts->mmap_pages, opts->auxtrace_mmap_pages);
			return -errno;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno,
				str_error_r(errno, msg, sizeof(msg)));
			if (errno)
				return -errno;
			else
				return -EINVAL;
		}
	}
	return 0;
}
731 
/* Convenience wrapper: mmap the session's own event list. */
static int record__mmap(struct record *rec)
{
	return record__mmap_evlist(rec, rec->evlist);
}
736 
/*
 * Open a kernel counter for every event in the list and mmap the ring
 * buffers.  On open failure an event first gets a chance to fall back
 * to a weaker configuration (perf_evsel__fallback); members of a weak
 * group are retried ungrouped.  Returns 0 on success or a negative
 * errno-style error.
 */
static int record__open(struct record *rec)
{
	char msg[BUFSIZ];
	struct evsel *pos;
	struct evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * For initial_delay we need to add a dummy event so that we can track
	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
	 * real events, the ones asked by the user.
	 */
	if (opts->initial_delay) {
		if (perf_evlist__add_dummy(evlist))
			return -ENOMEM;

		/* First event: no tracking; last event: tracking + enable_on_exec. */
		pos = perf_evlist__first(evlist);
		pos->tracking = 0;
		pos = perf_evlist__last(evlist);
		pos->tracking = 1;
		pos->core.attr.enable_on_exec = 1;
	}

	perf_evlist__config(evlist, opts, &callchain_param);

	evlist__for_each_entry(evlist, pos) {
try_again:
		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
			/* Retry with a downgraded event config if one exists. */
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose > 0)
					ui__warning("%s\n", msg);
				goto try_again;
			}
			/* Weak group member failed: break the group up and retry. */
			if ((errno == EINVAL || errno == EBADF) &&
			    pos->leader != pos &&
			    pos->weak_group) {
				pos = perf_evlist__reset_weak_group(evlist, pos);
				goto try_again;
			}
			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}

		pos->supported = true;
	}

	if (perf_evlist__apply_filters(evlist, &pos)) {
		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
			pos->filter, perf_evsel__name(pos), errno,
			str_error_r(errno, msg, sizeof(msg)));
		rc = -1;
		goto out;
	}

	rc = record__mmap(rec);
	if (rc)
		goto out;

	session->evlist = evlist;
	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
805 
806 static int process_sample_event(struct perf_tool *tool,
807 				union perf_event *event,
808 				struct perf_sample *sample,
809 				struct evsel *evsel,
810 				struct machine *machine)
811 {
812 	struct record *rec = container_of(tool, struct record, tool);
813 
814 	if (rec->evlist->first_sample_time == 0)
815 		rec->evlist->first_sample_time = sample->time;
816 
817 	rec->evlist->last_sample_time = sample->time;
818 
819 	if (rec->buildid_all)
820 		return 0;
821 
822 	rec->samples++;
823 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
824 }
825 
/*
 * Post-process the recorded data to collect build-ids of the DSOs that
 * were hit.  A no-op for an empty output file.
 */
static int process_buildids(struct record *rec)
{
	struct perf_session *session = rec->session;

	if (perf_data__size(&rec->data) == 0)
		return 0;

	/*
	 * During this process, it'll load kernel map and replace the
	 * dso->long_name to a real pathname it found.  In this case
	 * we prefer the vmlinux path like
	 *   /lib/modules/3.16.4/build/vmlinux
	 *
	 * rather than build-id path (in debug directory).
	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
	 */
	symbol_conf.ignore_vmlinux_buildid = true;

	/*
	 * If --buildid-all is given, it marks all DSO regardless of hits,
	 * so no need to process samples. But if timestamp_boundary is enabled,
	 * it still needs to walk on all samples to get the timestamps of
	 * first/last samples.
	 */
	if (rec->buildid_all && !rec->timestamp_boundary)
		rec->tool.sample = NULL;

	return perf_session__process_events(session);
}
855 
/*
 * Synthesize module and kernel mmap events for one guest machine so
 * guest samples can be resolved during report.
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * For the guest kernel, arrange the module mmap events prior to the
	 * guest kernel mmap and trigger a preload of the dso, because by
	 * default guest module symbols are loaded from guest kallsyms
	 * instead of /lib/modules/XXX/XXX. This avoids missing symbols when
	 * the first address is in a module rather than in the guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because the guest kernel's
	 * /proc/kallsyms sometimes has no _text.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
884 
/*
 * Header-only event appended after each full pass over the mmaps;
 * consumed on the report side to bound event reordering — see
 * PERF_RECORD_FINISHED_ROUND handling in the session code.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
889 
/*
 * In NODE/CPU affinity modes, migrate this thread to @map's CPU mask
 * before reading it — but only when the mask actually changed, to avoid
 * redundant sched_setaffinity() calls.
 */
static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
{
	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
		/* Copy map->affinity_mask into rec->affinity_mask (zero + OR). */
		CPU_ZERO(&rec->affinity_mask);
		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
	}
}
899 
900 static size_t process_comp_header(void *record, size_t increment)
901 {
902 	struct perf_record_compressed *event = record;
903 	size_t size = sizeof(*event);
904 
905 	if (increment) {
906 		event->header.size += increment;
907 		return increment;
908 	}
909 
910 	event->header.type = PERF_RECORD_COMPRESSED;
911 	event->header.size = size;
912 
913 	return size;
914 }
915 
916 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
917 			    void *src, size_t src_size)
918 {
919 	size_t compressed;
920 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct perf_record_compressed) - 1;
921 
922 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
923 						     max_record_size, process_comp_header);
924 
925 	session->bytes_transferred += src_size;
926 	session->bytes_compressed  += compressed;
927 
928 	return compressed;
929 }
930 
/*
 * Push pending data from every mmap of @evlist to the output file.
 *
 * @overwrite selects the backward (overwritable) mmaps.  @synch forces
 * map->flush to 1 for the duration so even partially filled buffers are
 * drained.  With AIO enabled the file position is tracked manually via
 * a local offset, since writes complete asynchronously.  A
 * FINISHED_ROUND event is appended whenever at least one byte was
 * written this pass.  Returns 0 on success, -1 on error.
 */
static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
				    bool overwrite, bool synch)
{
	u64 bytes_written = rec->bytes_written;
	int i;
	int rc = 0;
	struct perf_mmap *maps;
	int trace_fd = rec->data.file.fd;
	off_t off = 0;

	if (!evlist)
		return 0;

	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
	if (!maps)
		return 0;

	/* Backward mmaps are only readable once paused with pending data. */
	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
		return 0;

	if (record__aio_enabled(rec))
		off = record__aio_get_pos(trace_fd);

	for (i = 0; i < evlist->nr_mmaps; i++) {
		u64 flush = 0;
		struct perf_mmap *map = &maps[i];

		if (map->base) {
			record__adjust_affinity(rec, map);
			if (synch) {
				/* Temporarily flush everything, restore afterwards. */
				flush = map->flush;
				map->flush = 1;
			}
			if (!record__aio_enabled(rec)) {
				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			} else {
				if (record__aio_push(rec, map, &off) < 0) {
					record__aio_set_pos(trace_fd, off);
					if (synch)
						map->flush = flush;
					rc = -1;
					goto out;
				}
			}
			if (synch)
				map->flush = flush;
		}

		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
		    record__auxtrace_mmap_read(rec, map) != 0) {
			rc = -1;
			goto out;
		}
	}

	if (record__aio_enabled(rec))
		record__aio_set_pos(trace_fd, off);

	/*
	 * Mark the round finished in case we wrote
	 * at least one event.
	 */
	if (bytes_written != rec->bytes_written)
		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));

	if (overwrite)
		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
out:
	return rc;
}
1006 
1007 static int record__mmap_read_all(struct record *rec, bool synch)
1008 {
1009 	int err;
1010 
1011 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1012 	if (err)
1013 		return err;
1014 
1015 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1016 }
1017 
1018 static void record__init_features(struct record *rec)
1019 {
1020 	struct perf_session *session = rec->session;
1021 	int feat;
1022 
1023 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1024 		perf_header__set_feat(&session->header, feat);
1025 
1026 	if (rec->no_buildid)
1027 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1028 
1029 	if (!have_tracepoints(&rec->evlist->core.entries))
1030 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1031 
1032 	if (!rec->opts.branch_stack)
1033 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1034 
1035 	if (!rec->opts.full_auxtrace)
1036 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1037 
1038 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1039 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1040 
1041 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1042 	if (!record__comp_enabled(rec))
1043 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1044 
1045 	perf_header__clear_feat(&session->header, HEADER_STAT);
1046 }
1047 
1048 static void
1049 record__finish_output(struct record *rec)
1050 {
1051 	struct perf_data *data = &rec->data;
1052 	int fd = perf_data__fd(data);
1053 
1054 	if (data->is_pipe)
1055 		return;
1056 
1057 	rec->session->header.data_size += rec->bytes_written;
1058 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1059 
1060 	if (!rec->no_buildid) {
1061 		process_buildids(rec);
1062 
1063 		if (rec->buildid_all)
1064 			dsos__hit_all(rec->session);
1065 	}
1066 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1067 
1068 	return;
1069 }
1070 
1071 static int record__synthesize_workload(struct record *rec, bool tail)
1072 {
1073 	int err;
1074 	struct perf_thread_map *thread_map;
1075 
1076 	if (rec->opts.tail_synthesize != tail)
1077 		return 0;
1078 
1079 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1080 	if (thread_map == NULL)
1081 		return -1;
1082 
1083 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1084 						 process_synthesized_event,
1085 						 &rec->session->machines.host,
1086 						 rec->opts.sample_address);
1087 	perf_thread_map__put(thread_map);
1088 	return err;
1089 }
1090 
1091 static int record__synthesize(struct record *rec, bool tail);
1092 
/*
 * Rotate the output file (--switch-output): flush and finalize the
 * current perf.data, move it to a timestamped name and, unless called
 * at exit, reset byte accounting and re-synthesize tracking events into
 * the fresh file.  With --switch-output=<n> files the filenames form a
 * ring and the oldest file is removed.  Returns the new output fd from
 * perf_data__switch(), or a negative error.
 */
static int
record__switch_output(struct record *rec, bool at_exit)
{
	struct perf_data *data = &rec->data;
	int fd, err;
	char *new_filename;

	/* Same Size:      "2015122520103046"*/
	char timestamp[] = "InvalidTimestamp";

	/* Wait for queued AIO writes before touching the file. */
	record__aio_mmap_read_sync(rec);

	record__synthesize(rec, true);
	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

	rec->samples = 0;
	record__finish_output(rec);
	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
	if (err) {
		pr_err("Failed to get current timestamp\n");
		return -EINVAL;
	}

	fd = perf_data__switch(data, timestamp,
				    rec->session->header.data_offset,
				    at_exit, &new_filename);
	if (fd >= 0 && !at_exit) {
		/* Fresh file: restart byte accounting. */
		rec->bytes_written = 0;
		rec->session->header.data_size = 0;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
			data->path, timestamp);

	/* Keep only the last num_files rotated files (ring of names). */
	if (rec->switch_output.num_files) {
		int n = rec->switch_output.cur_file + 1;

		if (n >= rec->switch_output.num_files)
			n = 0;
		rec->switch_output.cur_file = n;
		if (rec->switch_output.filenames[n]) {
			remove(rec->switch_output.filenames[n]);
			zfree(&rec->switch_output.filenames[n]);
		}
		rec->switch_output.filenames[n] = new_filename;
	} else {
		free(new_filename);
	}

	/* Output tracking events */
	if (!at_exit) {
		record__synthesize(rec, false);

		/*
		 * In 'perf record --switch-output' without -a,
		 * record__synthesize() in record__switch_output() won't
		 * generate tracking events because there's no thread_map
		 * in evlist. Which causes newly created perf.data doesn't
		 * contain map and comm information.
		 * Create a fake thread_map and directly call
		 * perf_event__synthesize_thread_map() for those events.
		 */
		if (target__none(&rec->opts.target))
			record__synthesize_workload(rec, false);
	}
	return fd;
}
1162 
/* Non-zero iff the forked workload failed to exec; checked in __cmd_record(). */
static volatile int workload_exec_errno;

/*
 * perf_evlist__prepare_workload will send a SIGUSR1
 * if the fork fails, since we asked by setting its
 * want_signal to true.
 *
 * The signal's payload (sival_int) carries the child's error code —
 * presumably its errno; confirm against perf_evlist__prepare_workload.
 */
static void workload_exec_failed_signal(int signo __maybe_unused,
					siginfo_t *info,
					void *ucontext __maybe_unused)
{
	workload_exec_errno = info->si_value.sival_int;
	done = 1;		/* break out of the main mmap-read loop */
	child_finished = 1;	/* suppress the SIGTERM sent during cleanup */
}
1178 
1179 static void snapshot_sig_handler(int sig);
1180 static void alarm_sig_handler(int sig);
1181 
/*
 * Default no-op implementation.  Declared __weak so arch-specific code can
 * provide a real TIME_CONV synthesis (it receives the mmap control page
 * picked by record__pick_pc()).
 */
int __weak
perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
			    struct perf_tool *tool __maybe_unused,
			    perf_event__handler_t process __maybe_unused,
			    struct machine *machine __maybe_unused)
{
	return 0;
}
1190 
1191 static const struct perf_event_mmap_page *
1192 perf_evlist__pick_pc(struct evlist *evlist)
1193 {
1194 	if (evlist) {
1195 		if (evlist->mmap && evlist->mmap[0].base)
1196 			return evlist->mmap[0].base;
1197 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1198 			return evlist->overwrite_mmap[0].base;
1199 	}
1200 	return NULL;
1201 }
1202 
1203 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1204 {
1205 	const struct perf_event_mmap_page *pc;
1206 
1207 	pc = perf_evlist__pick_pc(rec->evlist);
1208 	if (pc)
1209 		return pc;
1210 	return NULL;
1211 }
1212 
1213 static int record__synthesize(struct record *rec, bool tail)
1214 {
1215 	struct perf_session *session = rec->session;
1216 	struct machine *machine = &session->machines.host;
1217 	struct perf_data *data = &rec->data;
1218 	struct record_opts *opts = &rec->opts;
1219 	struct perf_tool *tool = &rec->tool;
1220 	int fd = perf_data__fd(data);
1221 	int err = 0;
1222 
1223 	if (rec->opts.tail_synthesize != tail)
1224 		return 0;
1225 
1226 	if (data->is_pipe) {
1227 		/*
1228 		 * We need to synthesize events first, because some
1229 		 * features works on top of them (on report side).
1230 		 */
1231 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1232 						   process_synthesized_event);
1233 		if (err < 0) {
1234 			pr_err("Couldn't synthesize attrs.\n");
1235 			goto out;
1236 		}
1237 
1238 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1239 						      process_synthesized_event);
1240 		if (err < 0) {
1241 			pr_err("Couldn't synthesize features.\n");
1242 			return err;
1243 		}
1244 
1245 		if (have_tracepoints(&rec->evlist->core.entries)) {
1246 			/*
1247 			 * FIXME err <= 0 here actually means that
1248 			 * there were no tracepoints so its not really
1249 			 * an error, just that we don't need to
1250 			 * synthesize anything.  We really have to
1251 			 * return this more properly and also
1252 			 * propagate errors that now are calling die()
1253 			 */
1254 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1255 								  process_synthesized_event);
1256 			if (err <= 0) {
1257 				pr_err("Couldn't record tracing data.\n");
1258 				goto out;
1259 			}
1260 			rec->bytes_written += err;
1261 		}
1262 	}
1263 
1264 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1265 					  process_synthesized_event, machine);
1266 	if (err)
1267 		goto out;
1268 
1269 	if (rec->opts.full_auxtrace) {
1270 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1271 					session, process_synthesized_event);
1272 		if (err)
1273 			goto out;
1274 	}
1275 
1276 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1277 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1278 							 machine);
1279 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1280 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1281 				   "Check /proc/kallsyms permission or run as root.\n");
1282 
1283 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1284 						     machine);
1285 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1286 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1287 				   "Check /proc/modules permission or run as root.\n");
1288 	}
1289 
1290 	if (perf_guest) {
1291 		machines__process_guests(&session->machines,
1292 					 perf_event__synthesize_guest_os, tool);
1293 	}
1294 
1295 	err = perf_event__synthesize_extra_attr(&rec->tool,
1296 						rec->evlist,
1297 						process_synthesized_event,
1298 						data->is_pipe);
1299 	if (err)
1300 		goto out;
1301 
1302 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1303 						 process_synthesized_event,
1304 						NULL);
1305 	if (err < 0) {
1306 		pr_err("Couldn't synthesize thread map.\n");
1307 		return err;
1308 	}
1309 
1310 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1311 					     process_synthesized_event, NULL);
1312 	if (err < 0) {
1313 		pr_err("Couldn't synthesize cpu map.\n");
1314 		return err;
1315 	}
1316 
1317 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1318 						machine, opts);
1319 	if (err < 0)
1320 		pr_warning("Couldn't synthesize bpf events.\n");
1321 
1322 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1323 					    process_synthesized_event, opts->sample_address,
1324 					    1);
1325 out:
1326 	return err;
1327 }
1328 
/*
 * The main body of 'perf record': create the session, optionally fork the
 * workload, open and mmap the events, write the header, synthesize state
 * events, then loop reading the ring buffers until the workload exits or
 * the user interrupts, and finally flush, finalize the output file and
 * tear the session down.  Returns the workload's exit status (when
 * forking) or a negative error.
 */
static int __cmd_record(struct record *rec, int argc, const char **argv)
{
	int err;
	int status = 0;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct perf_tool *tool = &rec->tool;
	struct record_opts *opts = &rec->opts;
	struct perf_data *data = &rec->data;
	struct perf_session *session;
	bool disabled = false, draining = false;
	struct evlist *sb_evlist = NULL;
	int fd;
	float ratio = 0;

	atexit(record__sig_exit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGTERM, sig_handler);
	signal(SIGSEGV, sigsegv_handler);

	if (rec->opts.record_namespaces)
		tool->namespace_events = true;

	/* SIGUSR2 drives both auxtrace snapshots and --switch-output. */
	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
		signal(SIGUSR2, snapshot_sig_handler);
		if (rec->opts.auxtrace_snapshot_mode)
			trigger_on(&auxtrace_snapshot_trigger);
		if (rec->switch_output.enabled)
			trigger_on(&switch_output_trigger);
	} else {
		signal(SIGUSR2, SIG_IGN);
	}

	session = perf_session__new(data, false, tool);
	if (session == NULL) {
		pr_err("Perf session creation failed.\n");
		return -1;
	}

	fd = perf_data__fd(data);
	rec->session = session;

	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
		pr_err("Compression initialization failed.\n");
		return -1;
	}

	session->header.env.comp_type  = PERF_COMP_ZSTD;
	session->header.env.comp_level = rec->opts.comp_level;

	record__init_features(rec);

	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;

	/* Fork (but do not yet exec) the workload so we can set it up first. */
	if (forks) {
		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
						    argv, data->is_pipe,
						    workload_exec_failed_signal);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			status = err;
			goto out_delete_session;
		}
	}

	/*
	 * If we have just single event and are sending data
	 * through pipe, we need to force the ids allocation,
	 * because we synthesize event name through the pipe
	 * and need the id for that.
	 */
	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
		rec->opts.sample_id = true;

	if (record__open(rec) != 0) {
		err = -1;
		goto out_child;
	}
	session->header.env.comp_mmap_len = session->evlist->mmap_len;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_child;
	}

	/*
	 * Normally perf_session__new would do this, but it doesn't have the
	 * evlist.
	 */
	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
		rec->tool.ordered_events = false;
	}

	if (!rec->evlist->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	if (data->is_pipe) {
		err = perf_header__write_pipe(fd);
		if (err < 0)
			goto out_child;
	} else {
		err = perf_session__write_header(session, rec->evlist, fd, false);
		if (err < 0)
			goto out_child;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_child;
	}

	if (!opts->no_bpf_event)
		bpf_event__add_sb_event(&sb_evlist, &session->header.env);

	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
		opts->no_bpf_event = true;
	}

	err = record__synthesize(rec, false);
	if (err < 0)
		goto out_child;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_child;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!target__none(&opts->target) && !opts->initial_delay)
		evlist__enable(rec->evlist);

	/*
	 * Let the child rip
	 */
	if (forks) {
		struct machine *machine = &session->machines.host;
		union perf_event *event;
		pid_t tgid;

		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Some H/W events are generated before COMM event
		 * which is emitted during exec(), so perf script
		 * cannot see a correct process name for those events.
		 * Synthesize COMM event to prevent it.
		 */
		tgid = perf_event__synthesize_comm(tool, event,
						   rec->evlist->workload.pid,
						   process_synthesized_event,
						   machine);
		free(event);

		if (tgid == -1)
			goto out_child;

		event = malloc(sizeof(event->namespaces) +
			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
			       machine->id_hdr_size);
		if (event == NULL) {
			err = -ENOMEM;
			goto out_child;
		}

		/*
		 * Synthesize NAMESPACES event for the command specified.
		 */
		perf_event__synthesize_namespaces(tool, event,
						  rec->evlist->workload.pid,
						  tgid, process_synthesized_event,
						  machine);
		free(event);

		perf_evlist__start_workload(rec->evlist);
	}

	if (opts->initial_delay) {
		usleep(opts->initial_delay * USEC_PER_MSEC);
		evlist__enable(rec->evlist);
	}

	trigger_ready(&auxtrace_snapshot_trigger);
	trigger_ready(&switch_output_trigger);
	perf_hooks__invoke_record_start();
	/* Main loop: drain ring buffers until done/draining with no new hits. */
	for (;;) {
		unsigned long long hits = rec->samples;

		/*
		 * rec->evlist->bkw_mmap_state is possible to be
		 * BKW_MMAP_EMPTY here: when done == true and
		 * hits != rec->samples in previous round.
		 *
		 * perf_evlist__toggle_bkw_mmap ensure we never
		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
		 */
		if (trigger_is_hit(&switch_output_trigger) || done || draining)
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);

		if (record__mmap_read_all(rec, false) < 0) {
			trigger_error(&auxtrace_snapshot_trigger);
			trigger_error(&switch_output_trigger);
			err = -1;
			goto out_child;
		}

		if (auxtrace_record__snapshot_started) {
			auxtrace_record__snapshot_started = 0;
			if (!trigger_is_error(&auxtrace_snapshot_trigger))
				record__read_auxtrace_snapshot(rec, false);
			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
				pr_err("AUX area tracing snapshot failed\n");
				err = -1;
				goto out_child;
			}
		}

		if (trigger_is_hit(&switch_output_trigger)) {
			/*
			 * If switch_output_trigger is hit, the data in
			 * overwritable ring buffer should have been collected,
			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
			 *
			 * If SIGUSR2 raise after or during record__mmap_read_all(),
			 * record__mmap_read_all() didn't collect data from
			 * overwritable ring buffer. Read again.
			 */
			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
				continue;
			trigger_ready(&switch_output_trigger);

			/*
			 * Reenable events in overwrite ring buffer after
			 * record__mmap_read_all(): we should have collected
			 * data from it.
			 */
			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);

			if (!quiet)
				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
					waking);
			waking = 0;
			fd = record__switch_output(rec, false);
			if (fd < 0) {
				pr_err("Failed to switch to new file\n");
				trigger_error(&switch_output_trigger);
				err = fd;
				goto out_child;
			}

			/* re-arm the alarm */
			if (rec->switch_output.time)
				alarm(rec->switch_output.time);
		}

		if (hits == rec->samples) {
			if (done || draining)
				break;
			err = perf_evlist__poll(rec->evlist, -1);
			/*
			 * Propagate error, only if there's any. Ignore positive
			 * number of returned events and interrupt error.
			 */
			if (err > 0 || (err < 0 && errno == EINTR))
				err = 0;
			waking++;

			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
				draining = true;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !target__none(&opts->target)) {
			trigger_off(&auxtrace_snapshot_trigger);
			evlist__disable(rec->evlist);
			disabled = true;
		}
	}

	trigger_off(&auxtrace_snapshot_trigger);
	trigger_off(&switch_output_trigger);

	if (opts->auxtrace_snapshot_on_exit)
		record__auxtrace_snapshot_exit(rec);

	if (forks && workload_exec_errno) {
		char msg[STRERR_BUFSIZE];
		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
		pr_err("Workload failed: %s\n", emsg);
		err = -1;
		goto out_child;
	}

	if (!quiet)
		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	if (target__none(&rec->opts.target))
		record__synthesize_workload(rec, true);

out_child:
	/* Final flush of every ring buffer and any outstanding AIO. */
	record__mmap_read_all(rec, true);
	record__aio_mmap_read_sync(rec);

	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
		session->header.env.comp_ratio = ratio + 0.5;
	}

	if (forks) {
		int exit_status;

		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&exit_status);

		if (err < 0)
			status = err;
		else if (WIFEXITED(exit_status))
			status = WEXITSTATUS(exit_status);
		else if (WIFSIGNALED(exit_status))
			signr = WTERMSIG(exit_status);
	} else
		status = err;

	record__synthesize(rec, true);
	/* this will be recalculated during process_buildids() */
	rec->samples = 0;

	if (!err) {
		if (!rec->timestamp_filename) {
			record__finish_output(rec);
		} else {
			fd = record__switch_output(rec, true);
			if (fd < 0) {
				status = fd;
				goto out_delete_session;
			}
		}
	}

	perf_hooks__invoke_record_end();

	if (!err && !quiet) {
		char samples[128];
		const char *postfix = rec->timestamp_filename ?
					".<timestamp>" : "";

		if (rec->samples && !rec->opts.full_auxtrace)
			scnprintf(samples, sizeof(samples),
				  " (%" PRIu64 " samples)", rec->samples);
		else
			samples[0] = '\0';

		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
			perf_data__size(data) / 1024.0 / 1024.0,
			data->path, postfix, samples);
		if (ratio) {
			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
					rec->session->bytes_transferred / 1024.0 / 1024.0,
					ratio);
		}
		fprintf(stderr, " ]\n");
	}

out_delete_session:
	zstd_fini(&session->zstd_data);
	perf_session__delete(session);

	if (!opts->no_bpf_event)
		perf_evlist__stop_sb_thread(sb_evlist);
	return status;
}
1731 
1732 static void callchain_debug(struct callchain_param *callchain)
1733 {
1734 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1735 
1736 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1737 
1738 	if (callchain->record_mode == CALLCHAIN_DWARF)
1739 		pr_debug("callchain: stack dump size %d\n",
1740 			 callchain->dump_size);
1741 }
1742 
1743 int record_opts__parse_callchain(struct record_opts *record,
1744 				 struct callchain_param *callchain,
1745 				 const char *arg, bool unset)
1746 {
1747 	int ret;
1748 	callchain->enabled = !unset;
1749 
1750 	/* --no-call-graph */
1751 	if (unset) {
1752 		callchain->record_mode = CALLCHAIN_NONE;
1753 		pr_debug("callchain: disabled\n");
1754 		return 0;
1755 	}
1756 
1757 	ret = parse_callchain_record_opt(arg, callchain);
1758 	if (!ret) {
1759 		/* Enable data address sampling for DWARF unwind. */
1760 		if (callchain->record_mode == CALLCHAIN_DWARF)
1761 			record->sample_address = true;
1762 		callchain_debug(callchain);
1763 	}
1764 
1765 	return ret;
1766 }
1767 
1768 int record_parse_callchain_opt(const struct option *opt,
1769 			       const char *arg,
1770 			       int unset)
1771 {
1772 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1773 }
1774 
1775 int record_callchain_opt(const struct option *opt,
1776 			 const char *arg __maybe_unused,
1777 			 int unset __maybe_unused)
1778 {
1779 	struct callchain_param *callchain = opt->value;
1780 
1781 	callchain->enabled = true;
1782 
1783 	if (callchain->record_mode == CALLCHAIN_NONE)
1784 		callchain->record_mode = CALLCHAIN_FP;
1785 
1786 	callchain_debug(callchain);
1787 	return 0;
1788 }
1789 
1790 static int perf_record_config(const char *var, const char *value, void *cb)
1791 {
1792 	struct record *rec = cb;
1793 
1794 	if (!strcmp(var, "record.build-id")) {
1795 		if (!strcmp(value, "cache"))
1796 			rec->no_buildid_cache = false;
1797 		else if (!strcmp(value, "no-cache"))
1798 			rec->no_buildid_cache = true;
1799 		else if (!strcmp(value, "skip"))
1800 			rec->no_buildid = true;
1801 		else
1802 			return -1;
1803 		return 0;
1804 	}
1805 	if (!strcmp(var, "record.call-graph")) {
1806 		var = "call-graph.record-mode";
1807 		return perf_default_config(var, value, cb);
1808 	}
1809 #ifdef HAVE_AIO_SUPPORT
1810 	if (!strcmp(var, "record.aio")) {
1811 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1812 		if (!rec->opts.nr_cblocks)
1813 			rec->opts.nr_cblocks = nr_cblocks_default;
1814 	}
1815 #endif
1816 
1817 	return 0;
1818 }
1819 
/* One entry in the clock-name -> clockid translation table below. */
struct clockid_map {
	const char *name;
	int clockid;
};

#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

/* Table terminator. */
#define CLOCKID_END	{ .name = NULL, }


/*
 * Add the missing ones, we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif
1843 
/* Clock names (and short aliases) accepted by -k/--clockid. */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
1862 
1863 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1864 {
1865 	struct timespec res;
1866 
1867 	*res_ns = 0;
1868 	if (!clock_getres(clk_id, &res))
1869 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1870 	else
1871 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1872 
1873 	return 0;
1874 }
1875 
1876 static int parse_clockid(const struct option *opt, const char *str, int unset)
1877 {
1878 	struct record_opts *opts = (struct record_opts *)opt->value;
1879 	const struct clockid_map *cm;
1880 	const char *ostr = str;
1881 
1882 	if (unset) {
1883 		opts->use_clockid = 0;
1884 		return 0;
1885 	}
1886 
1887 	/* no arg passed */
1888 	if (!str)
1889 		return 0;
1890 
1891 	/* no setting it twice */
1892 	if (opts->use_clockid)
1893 		return -1;
1894 
1895 	opts->use_clockid = true;
1896 
1897 	/* if its a number, we're done */
1898 	if (sscanf(str, "%d", &opts->clockid) == 1)
1899 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1900 
1901 	/* allow a "CLOCK_" prefix to the name */
1902 	if (!strncasecmp(str, "CLOCK_", 6))
1903 		str += 6;
1904 
1905 	for (cm = clockids; cm->name; cm++) {
1906 		if (!strcasecmp(str, cm->name)) {
1907 			opts->clockid = cm->clockid;
1908 			return get_clockid_res(opts->clockid,
1909 					       &opts->clockid_res_ns);
1910 		}
1911 	}
1912 
1913 	opts->use_clockid = false;
1914 	ui__warning("unknown clockid %s, check man page\n", ostr);
1915 	return -1;
1916 }
1917 
1918 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1919 {
1920 	struct record_opts *opts = (struct record_opts *)opt->value;
1921 
1922 	if (unset || !str)
1923 		return 0;
1924 
1925 	if (!strcasecmp(str, "node"))
1926 		opts->affinity = PERF_AFFINITY_NODE;
1927 	else if (!strcasecmp(str, "cpu"))
1928 		opts->affinity = PERF_AFFINITY_CPU;
1929 
1930 	return 0;
1931 }
1932 
1933 static int record__parse_mmap_pages(const struct option *opt,
1934 				    const char *str,
1935 				    int unset __maybe_unused)
1936 {
1937 	struct record_opts *opts = opt->value;
1938 	char *s, *p;
1939 	unsigned int mmap_pages;
1940 	int ret;
1941 
1942 	if (!str)
1943 		return -EINVAL;
1944 
1945 	s = strdup(str);
1946 	if (!s)
1947 		return -ENOMEM;
1948 
1949 	p = strchr(s, ',');
1950 	if (p)
1951 		*p = '\0';
1952 
1953 	if (*s) {
1954 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1955 		if (ret)
1956 			goto out_free;
1957 		opts->mmap_pages = mmap_pages;
1958 	}
1959 
1960 	if (!p) {
1961 		ret = 0;
1962 		goto out_free;
1963 	}
1964 
1965 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1966 	if (ret)
1967 		goto out_free;
1968 
1969 	opts->auxtrace_mmap_pages = mmap_pages;
1970 
1971 out_free:
1972 	free(s);
1973 	return ret;
1974 }
1975 
1976 static void switch_output_size_warn(struct record *rec)
1977 {
1978 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1979 	struct switch_output *s = &rec->switch_output;
1980 
1981 	wakeup_size /= 2;
1982 
1983 	if (s->size < wakeup_size) {
1984 		char buf[100];
1985 
1986 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1987 		pr_warning("WARNING: switch-output data size lower than "
1988 			   "wakeup kernel buffer size (%s) "
1989 			   "expect bigger perf.data sizes\n", buf);
1990 	}
1991 }
1992 
1993 static int switch_output_setup(struct record *rec)
1994 {
1995 	struct switch_output *s = &rec->switch_output;
1996 	static struct parse_tag tags_size[] = {
1997 		{ .tag  = 'B', .mult = 1       },
1998 		{ .tag  = 'K', .mult = 1 << 10 },
1999 		{ .tag  = 'M', .mult = 1 << 20 },
2000 		{ .tag  = 'G', .mult = 1 << 30 },
2001 		{ .tag  = 0 },
2002 	};
2003 	static struct parse_tag tags_time[] = {
2004 		{ .tag  = 's', .mult = 1        },
2005 		{ .tag  = 'm', .mult = 60       },
2006 		{ .tag  = 'h', .mult = 60*60    },
2007 		{ .tag  = 'd', .mult = 60*60*24 },
2008 		{ .tag  = 0 },
2009 	};
2010 	unsigned long val;
2011 
2012 	if (!s->set)
2013 		return 0;
2014 
2015 	if (!strcmp(s->str, "signal")) {
2016 		s->signal = true;
2017 		pr_debug("switch-output with SIGUSR2 signal\n");
2018 		goto enabled;
2019 	}
2020 
2021 	val = parse_tag_value(s->str, tags_size);
2022 	if (val != (unsigned long) -1) {
2023 		s->size = val;
2024 		pr_debug("switch-output with %s size threshold\n", s->str);
2025 		goto enabled;
2026 	}
2027 
2028 	val = parse_tag_value(s->str, tags_time);
2029 	if (val != (unsigned long) -1) {
2030 		s->time = val;
2031 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2032 			 s->str, s->time);
2033 		goto enabled;
2034 	}
2035 
2036 	return -1;
2037 
2038 enabled:
2039 	rec->timestamp_filename = true;
2040 	s->enabled              = true;
2041 
2042 	if (s->size && !rec->opts.no_buffering)
2043 		switch_output_size_warn(rec);
2044 
2045 	return 0;
2046 }
2047 
/* Usage strings shown by 'perf record -h' and on option errors. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
const char * const *record_usage = __record_usage;
2054 
2055 /*
2056  * XXX Ideally would be local to cmd_record() and passed to a record__new
2057  * because we need to have access to it in record__exit, that is called
2058  * after cmd_record() exits, but since record_options need to be accessible to
2059  * builtin-script, leave it here.
2060  *
 * At least we don't touch it in all the other functions here directly.
2062  *
2063  * Just say no to tons of global variables, sigh.
2064  */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/* UINT_MAX / ULLONG_MAX act as "not set by the user" sentinels */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		.mmap_flush          = MMAP_FLUSH_DEFAULT,
	},
	/* perf_tool callbacks used when processing build-ids at the end */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.namespaces	= perf_event__process_namespaces,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
2089 
/* Help text for --call-graph, shared with builtin-script via the header. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";

/* --dry-run: parse events/options but record nothing. */
static bool dry_run;
2094 
2095 /*
2096  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2097  * with it and switch to use the library functions in perf_evlist that came
2098  * from builtin-record.c, i.e. use record_opts,
2099  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
2100  * using pipes, etc.
2101  */
2102 static struct option __record_options[] = {
2103 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2104 		     "event selector. use 'perf list' to list available events",
2105 		     parse_events_option),
2106 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2107 		     "event filter", parse_filter),
2108 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2109 			   NULL, "don't record events from perf itself",
2110 			   exclude_perf),
2111 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2112 		    "record events on existing process id"),
2113 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2114 		    "record events on existing thread id"),
2115 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2116 		    "collect data with this RT SCHED_FIFO priority"),
2117 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2118 		    "collect data without buffering"),
2119 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2120 		    "collect raw sample records from all opened counters"),
2121 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2122 			    "system-wide collection from all CPUs"),
2123 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2124 		    "list of cpus to monitor"),
2125 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2126 	OPT_STRING('o', "output", &record.data.path, "file",
2127 		    "output file name"),
2128 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2129 			&record.opts.no_inherit_set,
2130 			"child tasks do not inherit counters"),
2131 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2132 		    "synthesize non-sample events at the end of output"),
2133 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2134 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "record bpf events"),
2135 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2136 		    "Fail if the specified frequency can't be used"),
2137 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2138 		     "profile at this frequency",
2139 		      record__parse_freq),
2140 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2141 		     "number of mmap data pages and AUX area tracing mmap pages",
2142 		     record__parse_mmap_pages),
2143 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2144 		     "Minimal number of bytes that is extracted from mmap data pages (default: 1)",
2145 		     record__mmap_flush_parse),
2146 	OPT_BOOLEAN(0, "group", &record.opts.group,
2147 		    "put the counters into a counter group"),
2148 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2149 			   NULL, "enables call-graph recording" ,
2150 			   &record_callchain_opt),
2151 	OPT_CALLBACK(0, "call-graph", &record.opts,
2152 		     "record_mode[,record_size]", record_callchain_help,
2153 		     &record_parse_callchain_opt),
2154 	OPT_INCR('v', "verbose", &verbose,
2155 		    "be more verbose (show counter open errors, etc)"),
2156 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2157 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2158 		    "per thread counts"),
2159 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2160 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2161 		    "Record the sample physical addresses"),
2162 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2163 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2164 			&record.opts.sample_time_set,
2165 			"Record the sample timestamps"),
2166 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2167 			"Record the sample period"),
2168 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2169 		    "don't sample"),
2170 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2171 			&record.no_buildid_cache_set,
2172 			"do not update the buildid cache"),
2173 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2174 			&record.no_buildid_set,
2175 			"do not collect buildids in perf.data"),
2176 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2177 		     "monitor event in cgroup name only",
2178 		     parse_cgroups),
2179 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2180 		  "ms to wait before starting measurement after program start"),
2181 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2182 		   "user to profile"),
2183 
2184 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2185 		     "branch any", "sample any taken branches",
2186 		     parse_branch_stack),
2187 
2188 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2189 		     "branch filter mask", "branch stack filter modes",
2190 		     parse_branch_stack),
2191 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2192 		    "sample by weight (on special events only)"),
2193 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2194 		    "sample transaction flags (special events only)"),
2195 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2196 		    "use per-thread mmaps"),
2197 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2198 		    "sample selected machine registers on interrupt,"
2199 		    " use '-I?' to list register names", parse_intr_regs),
2200 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2201 		    "sample selected machine registers on interrupt,"
2202 		    " use '--user-regs=?' to list register names", parse_user_regs),
2203 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2204 		    "Record running/enabled time of read (:S) events"),
2205 	OPT_CALLBACK('k', "clockid", &record.opts,
2206 	"clockid", "clockid to use for events, see clock_gettime()",
2207 	parse_clockid),
2208 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2209 			  "opts", "AUX area tracing Snapshot Mode", ""),
2210 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2211 			"per thread proc mmap processing timeout in ms"),
2212 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2213 		    "Record namespaces events"),
2214 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2215 		    "Record context switch events"),
2216 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2217 			 "Configure all used events to run in kernel space.",
2218 			 PARSE_OPT_EXCLUSIVE),
2219 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2220 			 "Configure all used events to run in user space.",
2221 			 PARSE_OPT_EXCLUSIVE),
2222 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2223 		    "collect kernel callchains"),
2224 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2225 		    "collect user callchains"),
2226 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2227 		   "clang binary to use for compiling BPF scriptlets"),
2228 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2229 		   "options passed to clang when compiling BPF scriptlets"),
2230 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2231 		   "file", "vmlinux pathname"),
2232 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2233 		    "Record build-id of all DSOs regardless of hits"),
2234 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2235 		    "append timestamp to output filename"),
2236 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2237 		    "Record timestamp boundary (time of first/last samples)"),
2238 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2239 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2240 			  "Switch output when receiving SIGUSR2 (signal) or cross a size or time threshold",
2241 			  "signal"),
2242 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2243 		   "Limit number of switch output generated files"),
2244 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2245 		    "Parse options then exit"),
2246 #ifdef HAVE_AIO_SUPPORT
2247 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2248 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2249 		     record__aio_parse),
2250 #endif
2251 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2252 		     "Set affinity mask of trace reading thread to NUMA node cpu mask or cpu of processed mmap buffer",
2253 		     record__parse_affinity),
2254 #ifdef HAVE_ZSTD_SUPPORT
2255 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2256 			    "n", "Compressed records using specified level (default: 1 - fastest compression, 22 - greatest compression)",
2257 			    record__parse_comp_level),
2258 #endif
2259 	OPT_END()
2260 };
2261 
/* Non-static so other builtins can reference record's option table — TODO confirm external users. */
struct option *record_options = __record_options;
2263 
2264 int cmd_record(int argc, const char **argv)
2265 {
2266 	int err;
2267 	struct record *rec = &record;
2268 	char errbuf[BUFSIZ];
2269 
2270 	setlocale(LC_ALL, "");
2271 
2272 #ifndef HAVE_LIBBPF_SUPPORT
2273 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2274 	set_nobuild('\0', "clang-path", true);
2275 	set_nobuild('\0', "clang-opt", true);
2276 # undef set_nobuild
2277 #endif
2278 
2279 #ifndef HAVE_BPF_PROLOGUE
2280 # if !defined (HAVE_DWARF_SUPPORT)
2281 #  define REASON  "NO_DWARF=1"
2282 # elif !defined (HAVE_LIBBPF_SUPPORT)
2283 #  define REASON  "NO_LIBBPF=1"
2284 # else
2285 #  define REASON  "this architecture doesn't support BPF prologue"
2286 # endif
2287 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2288 	set_nobuild('\0', "vmlinux", true);
2289 # undef set_nobuild
2290 # undef REASON
2291 #endif
2292 
2293 	CPU_ZERO(&rec->affinity_mask);
2294 	rec->opts.affinity = PERF_AFFINITY_SYS;
2295 
2296 	rec->evlist = evlist__new();
2297 	if (rec->evlist == NULL)
2298 		return -ENOMEM;
2299 
2300 	err = perf_config(perf_record_config, rec);
2301 	if (err)
2302 		return err;
2303 
2304 	argc = parse_options(argc, argv, record_options, record_usage,
2305 			    PARSE_OPT_STOP_AT_NON_OPTION);
2306 	if (quiet)
2307 		perf_quiet_option();
2308 
2309 	/* Make system wide (-a) the default target. */
2310 	if (!argc && target__none(&rec->opts.target))
2311 		rec->opts.target.system_wide = true;
2312 
2313 	if (nr_cgroups && !rec->opts.target.system_wide) {
2314 		usage_with_options_msg(record_usage, record_options,
2315 			"cgroup monitoring only available in system-wide mode");
2316 
2317 	}
2318 
2319 	if (rec->opts.comp_level != 0) {
2320 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2321 		rec->no_buildid = true;
2322 	}
2323 
2324 	if (rec->opts.record_switch_events &&
2325 	    !perf_can_record_switch_events()) {
2326 		ui__error("kernel does not support recording context switch events\n");
2327 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2328 		return -EINVAL;
2329 	}
2330 
2331 	if (switch_output_setup(rec)) {
2332 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2333 		return -EINVAL;
2334 	}
2335 
2336 	if (rec->switch_output.time) {
2337 		signal(SIGALRM, alarm_sig_handler);
2338 		alarm(rec->switch_output.time);
2339 	}
2340 
2341 	if (rec->switch_output.num_files) {
2342 		rec->switch_output.filenames = calloc(sizeof(char *),
2343 						      rec->switch_output.num_files);
2344 		if (!rec->switch_output.filenames)
2345 			return -EINVAL;
2346 	}
2347 
2348 	/*
2349 	 * Allow aliases to facilitate the lookup of symbols for address
2350 	 * filters. Refer to auxtrace_parse_filters().
2351 	 */
2352 	symbol_conf.allow_aliases = true;
2353 
2354 	symbol__init(NULL);
2355 
2356 	err = record__auxtrace_init(rec);
2357 	if (err)
2358 		goto out;
2359 
2360 	if (dry_run)
2361 		goto out;
2362 
2363 	err = bpf__setup_stdout(rec->evlist);
2364 	if (err) {
2365 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2366 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2367 			 errbuf);
2368 		goto out;
2369 	}
2370 
2371 	err = -ENOMEM;
2372 
2373 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2374 		pr_warning(
2375 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2376 "check /proc/sys/kernel/kptr_restrict and /proc/sys/kernel/perf_event_paranoid.\n\n"
2377 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2378 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2379 "Samples in kernel modules won't be resolved at all.\n\n"
2380 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2381 "even with a suitable vmlinux or kallsyms file.\n\n");
2382 
2383 	if (rec->no_buildid_cache || rec->no_buildid) {
2384 		disable_buildid_cache();
2385 	} else if (rec->switch_output.enabled) {
2386 		/*
2387 		 * In 'perf record --switch-output', disable buildid
2388 		 * generation by default to reduce data file switching
2389 		 * overhead. Still generate buildid if they are required
2390 		 * explicitly using
2391 		 *
2392 		 *  perf record --switch-output --no-no-buildid \
2393 		 *              --no-no-buildid-cache
2394 		 *
2395 		 * Following code equals to:
2396 		 *
2397 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2398 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2399 		 *         disable_buildid_cache();
2400 		 */
2401 		bool disable = true;
2402 
2403 		if (rec->no_buildid_set && !rec->no_buildid)
2404 			disable = false;
2405 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2406 			disable = false;
2407 		if (disable) {
2408 			rec->no_buildid = true;
2409 			rec->no_buildid_cache = true;
2410 			disable_buildid_cache();
2411 		}
2412 	}
2413 
2414 	if (record.opts.overwrite)
2415 		record.opts.tail_synthesize = true;
2416 
2417 	if (rec->evlist->core.nr_entries == 0 &&
2418 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2419 		pr_err("Not enough memory for event selector list\n");
2420 		goto out;
2421 	}
2422 
2423 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2424 		rec->opts.no_inherit = true;
2425 
2426 	err = target__validate(&rec->opts.target);
2427 	if (err) {
2428 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2429 		ui__warning("%s\n", errbuf);
2430 	}
2431 
2432 	err = target__parse_uid(&rec->opts.target);
2433 	if (err) {
2434 		int saved_errno = errno;
2435 
2436 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2437 		ui__error("%s", errbuf);
2438 
2439 		err = -saved_errno;
2440 		goto out;
2441 	}
2442 
2443 	/* Enable ignoring missing threads when -u/-p option is defined. */
2444 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2445 
2446 	err = -ENOMEM;
2447 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2448 		usage_with_options(record_usage, record_options);
2449 
2450 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2451 	if (err)
2452 		goto out;
2453 
2454 	/*
2455 	 * We take all buildids when the file contains
2456 	 * AUX area tracing data because we do not decode the
2457 	 * trace because it would take too long.
2458 	 */
2459 	if (rec->opts.full_auxtrace)
2460 		rec->buildid_all = true;
2461 
2462 	if (record_opts__config(&rec->opts)) {
2463 		err = -EINVAL;
2464 		goto out;
2465 	}
2466 
2467 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2468 		rec->opts.nr_cblocks = nr_cblocks_max;
2469 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2470 
2471 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2472 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2473 
2474 	if (rec->opts.comp_level > comp_level_max)
2475 		rec->opts.comp_level = comp_level_max;
2476 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2477 
2478 	err = __cmd_record(&record, argc, argv);
2479 out:
2480 	evlist__delete(rec->evlist);
2481 	symbol__exit();
2482 	auxtrace_record__free(rec->itr);
2483 	return err;
2484 }
2485 
2486 static void snapshot_sig_handler(int sig __maybe_unused)
2487 {
2488 	struct record *rec = &record;
2489 
2490 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2491 		trigger_hit(&auxtrace_snapshot_trigger);
2492 		auxtrace_record__snapshot_started = 1;
2493 		if (auxtrace_record__snapshot_start(record.itr))
2494 			trigger_error(&auxtrace_snapshot_trigger);
2495 	}
2496 
2497 	if (switch_output_signal(rec))
2498 		trigger_hit(&switch_output_trigger);
2499 }
2500 
2501 static void alarm_sig_handler(int sig __maybe_unused)
2502 {
2503 	struct record *rec = &record;
2504 
2505 	if (switch_output_time(rec))
2506 		trigger_hit(&switch_output_trigger);
2507 }
2508