xref: /linux/tools/perf/builtin-record.c (revision 42fc2e9ef9603a7948aaa4ffd8dfb94b30294ad8)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "perf.h"
12 
13 #include "util/build-id.h"
14 #include <subcmd/parse-options.h>
15 #include "util/parse-events.h"
16 #include "util/config.h"
17 
18 #include "util/callchain.h"
19 #include "util/cgroup.h"
20 #include "util/header.h"
21 #include "util/event.h"
22 #include "util/evlist.h"
23 #include "util/evsel.h"
24 #include "util/debug.h"
25 #include "util/session.h"
26 #include "util/tool.h"
27 #include "util/symbol.h"
28 #include "util/cpumap.h"
29 #include "util/thread_map.h"
30 #include "util/data.h"
31 #include "util/perf_regs.h"
32 #include "util/auxtrace.h"
33 #include "util/tsc.h"
34 #include "util/parse-branch-options.h"
35 #include "util/parse-regs-options.h"
36 #include "util/llvm-utils.h"
37 #include "util/bpf-loader.h"
38 #include "util/trigger.h"
39 #include "util/perf-hooks.h"
40 #include "util/cpu-set-sched.h"
41 #include "util/time-utils.h"
42 #include "util/units.h"
43 #include "util/bpf-event.h"
44 #include "asm/bug.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56 #include <linux/zalloc.h>
57 
58 struct switch_output {
59 	bool		 enabled;
60 	bool		 signal;
61 	unsigned long	 size;
62 	unsigned long	 time;
63 	const char	*str;
64 	bool		 set;
65 	char		 **filenames;
66 	int		 num_files;
67 	int		 cur_file;
68 };
69 
70 struct record {
71 	struct perf_tool	tool;
72 	struct record_opts	opts;
73 	u64			bytes_written;
74 	struct perf_data	data;
75 	struct auxtrace_record	*itr;
76 	struct evlist	*evlist;
77 	struct perf_session	*session;
78 	int			realtime_prio;
79 	bool			no_buildid;
80 	bool			no_buildid_set;
81 	bool			no_buildid_cache;
82 	bool			no_buildid_cache_set;
83 	bool			buildid_all;
84 	bool			timestamp_filename;
85 	bool			timestamp_boundary;
86 	struct switch_output	switch_output;
87 	unsigned long long	samples;
88 	cpu_set_t		affinity_mask;
89 };
90 
91 static volatile int auxtrace_record__snapshot_started;
92 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
93 static DEFINE_TRIGGER(switch_output_trigger);
94 
95 static const char *affinity_tags[PERF_AFFINITY_MAX] = {
96 	"SYS", "NODE", "CPU"
97 };
98 
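/*
 * Helpers deciding whether the output file should be rotated: on a SIGUSR2
 * signal, once enough data has been written, or when the rotation timer
 * fires, and only while the switch-output trigger is ready.
 */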
99 static bool switch_output_signal(struct record *rec)
100 {
101 	return rec->switch_output.signal &&
102 	       trigger_is_ready(&switch_output_trigger);
103 }
104 
105 static bool switch_output_size(struct record *rec)
106 {
107 	return rec->switch_output.size &&
108 	       trigger_is_ready(&switch_output_trigger) &&
109 	       (rec->bytes_written >= rec->switch_output.size);
110 }
111 
112 static bool switch_output_time(struct record *rec)
113 {
114 	return rec->switch_output.time &&
115 	       trigger_is_ready(&switch_output_trigger);
116 }
117 
118 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
119 			 void *bf, size_t size)
120 {
121 	struct perf_data_file *file = &rec->session->data->file;
122 
123 	if (perf_data_file__write(file, bf, size) < 0) {
124 		pr_err("failed to write perf data, error: %m\n");
125 		return -1;
126 	}
127 
128 	rec->bytes_written += size;
129 
130 	if (switch_output_size(rec))
131 		trigger_hit(&switch_output_trigger);
132 
133 	return 0;
134 }
135 
136 static int record__aio_enabled(struct record *rec);
137 static int record__comp_enabled(struct record *rec);
138 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
139 			    void *src, size_t src_size);
140 
141 #ifdef HAVE_AIO_SUPPORT
142 static int record__aio_write(struct aiocb *cblock, int trace_fd,
143 		void *buf, size_t size, off_t off)
144 {
145 	int rc;
146 
147 	cblock->aio_fildes = trace_fd;
148 	cblock->aio_buf    = buf;
149 	cblock->aio_nbytes = size;
150 	cblock->aio_offset = off;
151 	cblock->aio_sigevent.sigev_notify = SIGEV_NONE;
152 
153 	do {
154 		rc = aio_write(cblock);
155 		if (rc == 0) {
156 			break;
157 		} else if (errno != EAGAIN) {
158 			cblock->aio_fildes = -1;
159 			pr_err("failed to queue perf data, error: %m\n");
160 			break;
161 		}
162 	} while (1);
163 
164 	return rc;
165 }
166 
167 static int record__aio_complete(struct perf_mmap *md, struct aiocb *cblock)
168 {
169 	void *rem_buf;
170 	off_t rem_off;
171 	size_t rem_size;
172 	int rc, aio_errno;
173 	ssize_t aio_ret, written;
174 
175 	aio_errno = aio_error(cblock);
176 	if (aio_errno == EINPROGRESS)
177 		return 0;
178 
179 	written = aio_ret = aio_return(cblock);
180 	if (aio_ret < 0) {
181 		if (aio_errno != EINTR)
182 			pr_err("failed to write perf data, error: %m\n");
183 		written = 0;
184 	}
185 
186 	rem_size = cblock->aio_nbytes - written;
187 
188 	if (rem_size == 0) {
189 		cblock->aio_fildes = -1;
190 		/*
191 		 * md->refcount is incremented in record__aio_pushfn() for
192 		 * every aio write request started in record__aio_push() so
193 		 * decrement it because the request is now complete.
194 		 */
195 		perf_mmap__put(md);
196 		rc = 1;
197 	} else {
198 		/*
199 		 * aio write request may require restart with the
200 		 * The aio write request may need to be restarted with the
201 		 * remainder if the kernel didn't write the whole
202 		 * chunk at once.
203 		rem_off = cblock->aio_offset + written;
204 		rem_buf = (void *)(cblock->aio_buf + written);
205 		record__aio_write(cblock, cblock->aio_fildes,
206 				rem_buf, rem_size, rem_off);
207 		rc = 0;
208 	}
209 
210 	return rc;
211 }
212 
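/*
 * Wait for in-flight aio writes on @md: with sync_all, block until every
 * control block has completed and return -1; otherwise return the index of
 * the first free control block, suspending on the busy ones when none is
 * available yet.
 */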
213 static int record__aio_sync(struct perf_mmap *md, bool sync_all)
214 {
215 	struct aiocb **aiocb = md->aio.aiocb;
216 	struct aiocb *cblocks = md->aio.cblocks;
217 	struct timespec timeout = { 0, 1000 * 1000  * 1 }; /* 1ms */
218 	int i, do_suspend;
219 
220 	do {
221 		do_suspend = 0;
222 		for (i = 0; i < md->aio.nr_cblocks; ++i) {
223 			if (cblocks[i].aio_fildes == -1 || record__aio_complete(md, &cblocks[i])) {
224 				if (sync_all)
225 					aiocb[i] = NULL;
226 				else
227 					return i;
228 			} else {
229 				/*
230 				 * The started aio write is not complete yet,
231 				 * so it has to be waited on before the
232 				 * next allocation.
233 				 */
234 				aiocb[i] = &cblocks[i];
235 				do_suspend = 1;
236 			}
237 		}
238 		if (!do_suspend)
239 			return -1;
240 
241 		while (aio_suspend((const struct aiocb **)aiocb, md->aio.nr_cblocks, &timeout)) {
242 			if (!(errno == EAGAIN || errno == EINTR))
243 				pr_err("failed to sync perf data, error: %m\n");
244 		}
245 	} while (1);
246 }
247 
248 struct record_aio {
249 	struct record	*rec;
250 	void		*data;
251 	size_t		size;
252 };
253 
254 static int record__aio_pushfn(struct perf_mmap *map, void *to, void *buf, size_t size)
255 {
256 	struct record_aio *aio = to;
257 
258 	/*
259 	 * map->base data pointed to by buf is copied into a free map->aio.data[]
260 	 * buffer to release space in the kernel buffer as fast as possible, by
261 	 * calling perf_mmap__consume() from the perf_mmap__push() function.
262 	 *
263 	 * That lets the kernel proceed with storing more profiling data into
264 	 * the kernel buffer earlier than other per-cpu kernel buffers are handled.
265 	 *
266 	 * Copying can be done in two steps in case the chunk of profiling data
267 	 * crosses the upper bound of the kernel buffer. In this case we first move
268 	 * the part of the data from map->start to the upper bound and then the
269 	 * remainder from the beginning of the kernel buffer to the end of the chunk.
270 	 */
271 
272 	if (record__comp_enabled(aio->rec)) {
273 		size = zstd_compress(aio->rec->session, aio->data + aio->size,
274 				     perf_mmap__mmap_len(map) - aio->size,
275 				     buf, size);
276 	} else {
277 		memcpy(aio->data + aio->size, buf, size);
278 	}
279 
280 	if (!aio->size) {
281 		/*
282 		 * Increment map->refcount to guard the map->aio.data[] buffer
283 		 * from premature deallocation, because the map object can be
284 		 * released before the aio write request started on the
285 		 * map->aio.data[] buffer completes.
286 		 *
287 		 * perf_mmap__put() is done at record__aio_complete() after the
288 		 * started aio request completes, or at record__aio_push()
289 		 * if the request failed to start.
290 		 */
291 		perf_mmap__get(map);
292 	}
293 
294 	aio->size += size;
295 
296 	return size;
297 }
298 
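/*
 * Grab a free aio buffer (waiting for a previous write to finish if needed),
 * copy and optionally compress the ring buffer contents into it via
 * record__aio_pushfn(), then queue an asynchronous write at *off and advance
 * *off on success.
 */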
299 static int record__aio_push(struct record *rec, struct perf_mmap *map, off_t *off)
300 {
301 	int ret, idx;
302 	int trace_fd = rec->session->data->file.fd;
303 	struct record_aio aio = { .rec = rec, .size = 0 };
304 
305 	/*
306 	 * Call record__aio_sync() to wait until a map->aio.data[] buffer
307 	 * becomes available after the previous aio write operation.
308 	 */
309 
310 	idx = record__aio_sync(map, false);
311 	aio.data = map->aio.data[idx];
312 	ret = perf_mmap__push(map, &aio, record__aio_pushfn);
313 	if (ret != 0) /* ret > 0 - no data, ret < 0 - error */
314 		return ret;
315 
316 	rec->samples++;
317 	ret = record__aio_write(&(map->aio.cblocks[idx]), trace_fd, aio.data, aio.size, *off);
318 	if (!ret) {
319 		*off += aio.size;
320 		rec->bytes_written += aio.size;
321 		if (switch_output_size(rec))
322 			trigger_hit(&switch_output_trigger);
323 	} else {
324 		/*
325 		 * Decrement the map->refcount incremented in record__aio_pushfn()
326 		 * if the record__aio_write() operation failed to start; otherwise
327 		 * map->refcount is decremented in record__aio_complete() after
328 		 * the aio write operation finishes successfully.
329 		 */
330 		perf_mmap__put(map);
331 	}
332 
333 	return ret;
334 }
335 
336 static off_t record__aio_get_pos(int trace_fd)
337 {
338 	return lseek(trace_fd, 0, SEEK_CUR);
339 }
340 
341 static void record__aio_set_pos(int trace_fd, off_t pos)
342 {
343 	lseek(trace_fd, pos, SEEK_SET);
344 }
345 
346 static void record__aio_mmap_read_sync(struct record *rec)
347 {
348 	int i;
349 	struct evlist *evlist = rec->evlist;
350 	struct perf_mmap *maps = evlist->mmap;
351 
352 	if (!record__aio_enabled(rec))
353 		return;
354 
355 	for (i = 0; i < evlist->nr_mmaps; i++) {
356 		struct perf_mmap *map = &maps[i];
357 
358 		if (map->base)
359 			record__aio_sync(map, true);
360 	}
361 }
362 
363 static int nr_cblocks_default = 1;
364 static int nr_cblocks_max = 4;
365 
366 static int record__aio_parse(const struct option *opt,
367 			     const char *str,
368 			     int unset)
369 {
370 	struct record_opts *opts = (struct record_opts *)opt->value;
371 
372 	if (unset) {
373 		opts->nr_cblocks = 0;
374 	} else {
375 		if (str)
376 			opts->nr_cblocks = strtol(str, NULL, 0);
377 		if (!opts->nr_cblocks)
378 			opts->nr_cblocks = nr_cblocks_default;
379 	}
380 
381 	return 0;
382 }
383 #else /* HAVE_AIO_SUPPORT */
384 static int nr_cblocks_max = 0;
385 
386 static int record__aio_push(struct record *rec __maybe_unused, struct perf_mmap *map __maybe_unused,
387 			    off_t *off __maybe_unused)
388 {
389 	return -1;
390 }
391 
392 static off_t record__aio_get_pos(int trace_fd __maybe_unused)
393 {
394 	return -1;
395 }
396 
397 static void record__aio_set_pos(int trace_fd __maybe_unused, off_t pos __maybe_unused)
398 {
399 }
400 
401 static void record__aio_mmap_read_sync(struct record *rec __maybe_unused)
402 {
403 }
404 #endif
405 
406 static int record__aio_enabled(struct record *rec)
407 {
408 	return rec->opts.nr_cblocks > 0;
409 }
410 
411 #define MMAP_FLUSH_DEFAULT 1
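/*
 * Parse the --mmap-flush value: a plain number or a size with a B/K/M/G
 * suffix, defaulting to 1 byte and capped at a quarter of the mmap buffer
 * size.
 */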
412 static int record__mmap_flush_parse(const struct option *opt,
413 				    const char *str,
414 				    int unset)
415 {
416 	int flush_max;
417 	struct record_opts *opts = (struct record_opts *)opt->value;
418 	static struct parse_tag tags[] = {
419 			{ .tag  = 'B', .mult = 1       },
420 			{ .tag  = 'K', .mult = 1 << 10 },
421 			{ .tag  = 'M', .mult = 1 << 20 },
422 			{ .tag  = 'G', .mult = 1 << 30 },
423 			{ .tag  = 0 },
424 	};
425 
426 	if (unset)
427 		return 0;
428 
429 	if (str) {
430 		opts->mmap_flush = parse_tag_value(str, tags);
431 		if (opts->mmap_flush == (int)-1)
432 			opts->mmap_flush = strtol(str, NULL, 0);
433 	}
434 
435 	if (!opts->mmap_flush)
436 		opts->mmap_flush = MMAP_FLUSH_DEFAULT;
437 
438 	flush_max = perf_evlist__mmap_size(opts->mmap_pages);
439 	flush_max /= 4;
440 	if (opts->mmap_flush > flush_max)
441 		opts->mmap_flush = flush_max;
442 
443 	return 0;
444 }
445 
446 #ifdef HAVE_ZSTD_SUPPORT
447 static unsigned int comp_level_default = 1;
448 
449 static int record__parse_comp_level(const struct option *opt, const char *str, int unset)
450 {
451 	struct record_opts *opts = opt->value;
452 
453 	if (unset) {
454 		opts->comp_level = 0;
455 	} else {
456 		if (str)
457 			opts->comp_level = strtol(str, NULL, 0);
458 		if (!opts->comp_level)
459 			opts->comp_level = comp_level_default;
460 	}
461 
462 	return 0;
463 }
464 #endif
465 static unsigned int comp_level_max = 22;
466 
467 static int record__comp_enabled(struct record *rec)
468 {
469 	return rec->opts.comp_level > 0;
470 }
471 
472 static int process_synthesized_event(struct perf_tool *tool,
473 				     union perf_event *event,
474 				     struct perf_sample *sample __maybe_unused,
475 				     struct machine *machine __maybe_unused)
476 {
477 	struct record *rec = container_of(tool, struct record, tool);
478 	return record__write(rec, NULL, event, event->header.size);
479 }
480 
481 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
482 {
483 	struct record *rec = to;
484 
485 	if (record__comp_enabled(rec)) {
486 		size = zstd_compress(rec->session, map->data, perf_mmap__mmap_len(map), bf, size);
487 		bf   = map->data;
488 	}
489 
490 	rec->samples++;
491 	return record__write(rec, map, bf, size);
492 }
493 
494 static volatile int done;
495 static volatile int signr = -1;
496 static volatile int child_finished;
497 
498 static void sig_handler(int sig)
499 {
500 	if (sig == SIGCHLD)
501 		child_finished = 1;
502 	else
503 		signr = sig;
504 
505 	done = 1;
506 }
507 
508 static void sigsegv_handler(int sig)
509 {
510 	perf_hooks__recover();
511 	sighandler_dump_stack(sig);
512 }
513 
514 static void record__sig_exit(void)
515 {
516 	if (signr == -1)
517 		return;
518 
519 	signal(signr, SIG_DFL);
520 	raise(signr);
521 }
522 
523 #ifdef HAVE_AUXTRACE_SUPPORT
524 
525 static int record__process_auxtrace(struct perf_tool *tool,
526 				    struct perf_mmap *map,
527 				    union perf_event *event, void *data1,
528 				    size_t len1, void *data2, size_t len2)
529 {
530 	struct record *rec = container_of(tool, struct record, tool);
531 	struct perf_data *data = &rec->data;
532 	size_t padding;
533 	u8 pad[8] = {0};
534 
535 	if (!perf_data__is_pipe(data) && !perf_data__is_dir(data)) {
536 		off_t file_offset;
537 		int fd = perf_data__fd(data);
538 		int err;
539 
540 		file_offset = lseek(fd, 0, SEEK_CUR);
541 		if (file_offset == -1)
542 			return -1;
543 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
544 						     event, file_offset);
545 		if (err)
546 			return err;
547 	}
548 
549 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
550 	padding = (len1 + len2) & 7;
551 	if (padding)
552 		padding = 8 - padding;
553 
554 	record__write(rec, map, event, event->header.size);
555 	record__write(rec, map, data1, len1);
556 	if (len2)
557 		record__write(rec, map, data2, len2);
558 	record__write(rec, map, &pad, padding);
559 
560 	return 0;
561 }
562 
563 static int record__auxtrace_mmap_read(struct record *rec,
564 				      struct perf_mmap *map)
565 {
566 	int ret;
567 
568 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
569 				  record__process_auxtrace);
570 	if (ret < 0)
571 		return ret;
572 
573 	if (ret)
574 		rec->samples++;
575 
576 	return 0;
577 }
578 
579 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
580 					       struct perf_mmap *map)
581 {
582 	int ret;
583 
584 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
585 					   record__process_auxtrace,
586 					   rec->opts.auxtrace_snapshot_size);
587 	if (ret < 0)
588 		return ret;
589 
590 	if (ret)
591 		rec->samples++;
592 
593 	return 0;
594 }
595 
596 static int record__auxtrace_read_snapshot_all(struct record *rec)
597 {
598 	int i;
599 	int rc = 0;
600 
601 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
602 		struct perf_mmap *map = &rec->evlist->mmap[i];
603 
604 		if (!map->auxtrace_mmap.base)
605 			continue;
606 
607 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
608 			rc = -1;
609 			goto out;
610 		}
611 	}
612 out:
613 	return rc;
614 }
615 
616 static void record__read_auxtrace_snapshot(struct record *rec, bool on_exit)
617 {
618 	pr_debug("Recording AUX area tracing snapshot\n");
619 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
620 		trigger_error(&auxtrace_snapshot_trigger);
621 	} else {
622 		if (auxtrace_record__snapshot_finish(rec->itr, on_exit))
623 			trigger_error(&auxtrace_snapshot_trigger);
624 		else
625 			trigger_ready(&auxtrace_snapshot_trigger);
626 	}
627 }
628 
629 static int record__auxtrace_snapshot_exit(struct record *rec)
630 {
631 	if (trigger_is_error(&auxtrace_snapshot_trigger))
632 		return 0;
633 
634 	if (!auxtrace_record__snapshot_started &&
635 	    auxtrace_record__snapshot_start(rec->itr))
636 		return -1;
637 
638 	record__read_auxtrace_snapshot(rec, true);
639 	if (trigger_is_error(&auxtrace_snapshot_trigger))
640 		return -1;
641 
642 	return 0;
643 }
644 
645 static int record__auxtrace_init(struct record *rec)
646 {
647 	int err;
648 
649 	if (!rec->itr) {
650 		rec->itr = auxtrace_record__init(rec->evlist, &err);
651 		if (err)
652 			return err;
653 	}
654 
655 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
656 					      rec->opts.auxtrace_snapshot_opts);
657 	if (err)
658 		return err;
659 
660 	return auxtrace_parse_filters(rec->evlist);
661 }
662 
663 #else
664 
665 static inline
666 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
667 			       struct perf_mmap *map __maybe_unused)
668 {
669 	return 0;
670 }
671 
672 static inline
673 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused,
674 				    bool on_exit __maybe_unused)
675 {
676 }
677 
678 static inline
679 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
680 {
681 	return 0;
682 }
683 
684 static inline
685 int record__auxtrace_snapshot_exit(struct record *rec __maybe_unused)
686 {
687 	return 0;
688 }
689 
690 static int record__auxtrace_init(struct record *rec __maybe_unused)
691 {
692 	return 0;
693 }
694 
695 #endif
696 
697 static int record__mmap_evlist(struct record *rec,
698 			       struct evlist *evlist)
699 {
700 	struct record_opts *opts = &rec->opts;
701 	char msg[512];
702 
703 	if (opts->affinity != PERF_AFFINITY_SYS)
704 		cpu__setup_cpunode_map();
705 
706 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
707 				 opts->auxtrace_mmap_pages,
708 				 opts->auxtrace_snapshot_mode,
709 				 opts->nr_cblocks, opts->affinity,
710 				 opts->mmap_flush, opts->comp_level) < 0) {
711 		if (errno == EPERM) {
712 			pr_err("Permission error mapping pages.\n"
713 			       "Consider increasing "
714 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
715 			       "or try again with a smaller value of -m/--mmap_pages.\n"
716 			       "(current value: %u,%u)\n",
717 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
718 			return -errno;
719 		} else {
720 			pr_err("failed to mmap with %d (%s)\n", errno,
721 				str_error_r(errno, msg, sizeof(msg)));
722 			if (errno)
723 				return -errno;
724 			else
725 				return -EINVAL;
726 		}
727 	}
728 	return 0;
729 }
730 
731 static int record__mmap(struct record *rec)
732 {
733 	return record__mmap_evlist(rec, rec->evlist);
734 }
735 
736 static int record__open(struct record *rec)
737 {
738 	char msg[BUFSIZ];
739 	struct evsel *pos;
740 	struct evlist *evlist = rec->evlist;
741 	struct perf_session *session = rec->session;
742 	struct record_opts *opts = &rec->opts;
743 	int rc = 0;
744 
745 	/*
746 	 * For initial_delay we need to add a dummy event so that we can track
747 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
748 	 * real events, the ones asked for by the user.
749 	 */
750 	if (opts->initial_delay) {
751 		if (perf_evlist__add_dummy(evlist))
752 			return -ENOMEM;
753 
754 		pos = perf_evlist__first(evlist);
755 		pos->tracking = 0;
756 		pos = perf_evlist__last(evlist);
757 		pos->tracking = 1;
758 		pos->core.attr.enable_on_exec = 1;
759 	}
760 
761 	perf_evlist__config(evlist, opts, &callchain_param);
762 
763 	evlist__for_each_entry(evlist, pos) {
764 try_again:
765 		if (evsel__open(pos, pos->core.cpus, pos->core.threads) < 0) {
766 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
767 				if (verbose > 0)
768 					ui__warning("%s\n", msg);
769 				goto try_again;
770 			}
771 			if ((errno == EINVAL || errno == EBADF) &&
772 			    pos->leader != pos &&
773 			    pos->weak_group) {
774 			        pos = perf_evlist__reset_weak_group(evlist, pos);
775 				goto try_again;
776 			}
777 			rc = -errno;
778 			perf_evsel__open_strerror(pos, &opts->target,
779 						  errno, msg, sizeof(msg));
780 			ui__error("%s\n", msg);
781 			goto out;
782 		}
783 
784 		pos->supported = true;
785 	}
786 
787 	if (perf_evlist__apply_filters(evlist, &pos)) {
788 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
789 			pos->filter, perf_evsel__name(pos), errno,
790 			str_error_r(errno, msg, sizeof(msg)));
791 		rc = -1;
792 		goto out;
793 	}
794 
795 	rc = record__mmap(rec);
796 	if (rc)
797 		goto out;
798 
799 	session->evlist = evlist;
800 	perf_session__set_id_hdr_size(session);
801 out:
802 	return rc;
803 }
804 
805 static int process_sample_event(struct perf_tool *tool,
806 				union perf_event *event,
807 				struct perf_sample *sample,
808 				struct evsel *evsel,
809 				struct machine *machine)
810 {
811 	struct record *rec = container_of(tool, struct record, tool);
812 
813 	if (rec->evlist->first_sample_time == 0)
814 		rec->evlist->first_sample_time = sample->time;
815 
816 	rec->evlist->last_sample_time = sample->time;
817 
818 	if (rec->buildid_all)
819 		return 0;
820 
821 	rec->samples++;
822 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
823 }
824 
825 static int process_buildids(struct record *rec)
826 {
827 	struct perf_session *session = rec->session;
828 
829 	if (perf_data__size(&rec->data) == 0)
830 		return 0;
831 
832 	/*
833 	 * During this process, it'll load the kernel map and replace the
834 	 * dso->long_name with the real pathname it found.  In this case
835 	 * we prefer the vmlinux path like
836 	 *   /lib/modules/3.16.4/build/vmlinux
837 	 *
838 	 * rather than build-id path (in debug directory).
839 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
840 	 */
841 	symbol_conf.ignore_vmlinux_buildid = true;
842 
843 	/*
844 	 * If --buildid-all is given, it marks all DSOs regardless of hits,
845 	 * so no need to process samples. But if timestamp_boundary is enabled,
846 	 * it still needs to walk all samples to get the timestamps of the
847 	 * first and last samples.
848 	 */
849 	if (rec->buildid_all && !rec->timestamp_boundary)
850 		rec->tool.sample = NULL;
851 
852 	return perf_session__process_events(session);
853 }
854 
855 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
856 {
857 	int err;
858 	struct perf_tool *tool = data;
859 	/*
860 	 * As for the guest kernel, when processing the record & report
861 	 * subcommands we arrange the module mmaps prior to the guest kernel
862 	 * mmap and trigger a DSO preload, because by default guest module
863 	 * symbols are loaded from guest kallsyms instead of
864 	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
865 	 * address is in a module instead of in the guest kernel.
866 	 */
867 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
868 					     machine);
869 	if (err < 0)
870 		pr_err("Couldn't record guest kernel [%d]'s reference"
871 		       " relocation symbol.\n", machine->pid);
872 
873 	/*
874 	 * We use _stext for the guest kernel because the guest kernel's
875 	 * /proc/kallsyms sometimes has no _text.
876 	 */
877 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
878 						 machine);
879 	if (err < 0)
880 		pr_err("Couldn't record guest kernel [%d]'s reference"
881 		       " relocation symbol.\n", machine->pid);
882 }
883 
884 static struct perf_event_header finished_round_event = {
885 	.size = sizeof(struct perf_event_header),
886 	.type = PERF_RECORD_FINISHED_ROUND,
887 };
888 
889 static void record__adjust_affinity(struct record *rec, struct perf_mmap *map)
890 {
891 	if (rec->opts.affinity != PERF_AFFINITY_SYS &&
892 	    !CPU_EQUAL(&rec->affinity_mask, &map->affinity_mask)) {
893 		CPU_ZERO(&rec->affinity_mask);
894 		CPU_OR(&rec->affinity_mask, &rec->affinity_mask, &map->affinity_mask);
895 		sched_setaffinity(0, sizeof(rec->affinity_mask), &rec->affinity_mask);
896 	}
897 }
898 
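/*
 * Callback used by zstd_compress_stream_to_records(): the first call
 * (increment == 0) initializes a PERF_RECORD_COMPRESSED header and returns
 * its size, later calls grow header.size by the number of bytes just
 * produced.
 */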
899 static size_t process_comp_header(void *record, size_t increment)
900 {
901 	struct compressed_event *event = record;
902 	size_t size = sizeof(*event);
903 
904 	if (increment) {
905 		event->header.size += increment;
906 		return increment;
907 	}
908 
909 	event->header.type = PERF_RECORD_COMPRESSED;
910 	event->header.size = size;
911 
912 	return size;
913 }
914 
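/*
 * Compress @src into @dst as one or more PERF_RECORD_COMPRESSED records,
 * account the transferred/compressed byte counts on the session and return
 * the compressed size.
 */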
915 static size_t zstd_compress(struct perf_session *session, void *dst, size_t dst_size,
916 			    void *src, size_t src_size)
917 {
918 	size_t compressed;
919 	size_t max_record_size = PERF_SAMPLE_MAX_SIZE - sizeof(struct compressed_event) - 1;
920 
921 	compressed = zstd_compress_stream_to_records(&session->zstd_data, dst, dst_size, src, src_size,
922 						     max_record_size, process_comp_header);
923 
924 	session->bytes_transferred += src_size;
925 	session->bytes_compressed  += compressed;
926 
927 	return compressed;
928 }
929 
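/*
 * Drain the regular or overwrite mmaps of @evlist into the output file,
 * either synchronously or via queued aio writes, read auxtrace data when not
 * in snapshot mode, and append a PERF_RECORD_FINISHED_ROUND marker if any
 * data was written.
 */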
930 static int record__mmap_read_evlist(struct record *rec, struct evlist *evlist,
931 				    bool overwrite, bool synch)
932 {
933 	u64 bytes_written = rec->bytes_written;
934 	int i;
935 	int rc = 0;
936 	struct perf_mmap *maps;
937 	int trace_fd = rec->data.file.fd;
938 	off_t off = 0;
939 
940 	if (!evlist)
941 		return 0;
942 
943 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
944 	if (!maps)
945 		return 0;
946 
947 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
948 		return 0;
949 
950 	if (record__aio_enabled(rec))
951 		off = record__aio_get_pos(trace_fd);
952 
953 	for (i = 0; i < evlist->nr_mmaps; i++) {
954 		u64 flush = 0;
955 		struct perf_mmap *map = &maps[i];
956 
957 		if (map->base) {
958 			record__adjust_affinity(rec, map);
959 			if (synch) {
960 				flush = map->flush;
961 				map->flush = 1;
962 			}
963 			if (!record__aio_enabled(rec)) {
964 				if (perf_mmap__push(map, rec, record__pushfn) < 0) {
965 					if (synch)
966 						map->flush = flush;
967 					rc = -1;
968 					goto out;
969 				}
970 			} else {
971 				if (record__aio_push(rec, map, &off) < 0) {
972 					record__aio_set_pos(trace_fd, off);
973 					if (synch)
974 						map->flush = flush;
975 					rc = -1;
976 					goto out;
977 				}
978 			}
979 			if (synch)
980 				map->flush = flush;
981 		}
982 
983 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
984 		    record__auxtrace_mmap_read(rec, map) != 0) {
985 			rc = -1;
986 			goto out;
987 		}
988 	}
989 
990 	if (record__aio_enabled(rec))
991 		record__aio_set_pos(trace_fd, off);
992 
993 	/*
994 	 * Mark the round finished in case we wrote
995 	 * at least one event.
996 	 */
997 	if (bytes_written != rec->bytes_written)
998 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
999 
1000 	if (overwrite)
1001 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
1002 out:
1003 	return rc;
1004 }
1005 
1006 static int record__mmap_read_all(struct record *rec, bool synch)
1007 {
1008 	int err;
1009 
1010 	err = record__mmap_read_evlist(rec, rec->evlist, false, synch);
1011 	if (err)
1012 		return err;
1013 
1014 	return record__mmap_read_evlist(rec, rec->evlist, true, synch);
1015 }
1016 
1017 static void record__init_features(struct record *rec)
1018 {
1019 	struct perf_session *session = rec->session;
1020 	int feat;
1021 
1022 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
1023 		perf_header__set_feat(&session->header, feat);
1024 
1025 	if (rec->no_buildid)
1026 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
1027 
1028 	if (!have_tracepoints(&rec->evlist->core.entries))
1029 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
1030 
1031 	if (!rec->opts.branch_stack)
1032 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
1033 
1034 	if (!rec->opts.full_auxtrace)
1035 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
1036 
1037 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
1038 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
1039 
1040 	perf_header__clear_feat(&session->header, HEADER_DIR_FORMAT);
1041 	if (!record__comp_enabled(rec))
1042 		perf_header__clear_feat(&session->header, HEADER_COMPRESSED);
1043 
1044 	perf_header__clear_feat(&session->header, HEADER_STAT);
1045 }
1046 
1047 static void
1048 record__finish_output(struct record *rec)
1049 {
1050 	struct perf_data *data = &rec->data;
1051 	int fd = perf_data__fd(data);
1052 
1053 	if (data->is_pipe)
1054 		return;
1055 
1056 	rec->session->header.data_size += rec->bytes_written;
1057 	data->file.size = lseek(perf_data__fd(data), 0, SEEK_CUR);
1058 
1059 	if (!rec->no_buildid) {
1060 		process_buildids(rec);
1061 
1062 		if (rec->buildid_all)
1063 			dsos__hit_all(rec->session);
1064 	}
1065 	perf_session__write_header(rec->session, rec->evlist, fd, true);
1066 
1067 	return;
1068 }
1069 
1070 static int record__synthesize_workload(struct record *rec, bool tail)
1071 {
1072 	int err;
1073 	struct perf_thread_map *thread_map;
1074 
1075 	if (rec->opts.tail_synthesize != tail)
1076 		return 0;
1077 
1078 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
1079 	if (thread_map == NULL)
1080 		return -1;
1081 
1082 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
1083 						 process_synthesized_event,
1084 						 &rec->session->machines.host,
1085 						 rec->opts.sample_address);
1086 	perf_thread_map__put(thread_map);
1087 	return err;
1088 }
1089 
1090 static int record__synthesize(struct record *rec, bool tail);
1091 
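/*
 * Finish the current output file (synthesizing tail events and writing its
 * header), then switch data writing to a new timestamped file; when a limit
 * on the number of output files is configured, the oldest file in the ring
 * is removed and its slot reused.
 */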
1092 static int
1093 record__switch_output(struct record *rec, bool at_exit)
1094 {
1095 	struct perf_data *data = &rec->data;
1096 	int fd, err;
1097 	char *new_filename;
1098 
1099 	/* Same Size:      "2015122520103046"*/
1100 	char timestamp[] = "InvalidTimestamp";
1101 
1102 	record__aio_mmap_read_sync(rec);
1103 
1104 	record__synthesize(rec, true);
1105 	if (target__none(&rec->opts.target))
1106 		record__synthesize_workload(rec, true);
1107 
1108 	rec->samples = 0;
1109 	record__finish_output(rec);
1110 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
1111 	if (err) {
1112 		pr_err("Failed to get current timestamp\n");
1113 		return -EINVAL;
1114 	}
1115 
1116 	fd = perf_data__switch(data, timestamp,
1117 				    rec->session->header.data_offset,
1118 				    at_exit, &new_filename);
1119 	if (fd >= 0 && !at_exit) {
1120 		rec->bytes_written = 0;
1121 		rec->session->header.data_size = 0;
1122 	}
1123 
1124 	if (!quiet)
1125 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
1126 			data->path, timestamp);
1127 
1128 	if (rec->switch_output.num_files) {
1129 		int n = rec->switch_output.cur_file + 1;
1130 
1131 		if (n >= rec->switch_output.num_files)
1132 			n = 0;
1133 		rec->switch_output.cur_file = n;
1134 		if (rec->switch_output.filenames[n]) {
1135 			remove(rec->switch_output.filenames[n]);
1136 			zfree(&rec->switch_output.filenames[n]);
1137 		}
1138 		rec->switch_output.filenames[n] = new_filename;
1139 	} else {
1140 		free(new_filename);
1141 	}
1142 
1143 	/* Output tracking events */
1144 	if (!at_exit) {
1145 		record__synthesize(rec, false);
1146 
1147 		/*
1148 		 * In 'perf record --switch-output' without -a,
1149 		 * record__synthesize() in record__switch_output() won't
1150 		 * generate tracking events because there's no thread_map
1151 		 * in evlist. Which causes newly created perf.data doesn't
1152 		 * in evlist, which causes the newly created perf.data to
1153 		 * lack map and comm information.
1154 		 * perf_event__synthesize_thread_map() for those events.
1155 		 */
1156 		if (target__none(&rec->opts.target))
1157 			record__synthesize_workload(rec, false);
1158 	}
1159 	return fd;
1160 }
1161 
1162 static volatile int workload_exec_errno;
1163 
1164 /*
1165  * perf_evlist__prepare_workload will send a SIGUSR1
1166  * if the fork fails, since we asked for it by setting its
1167  * want_signal to true.
1168  */
1169 static void workload_exec_failed_signal(int signo __maybe_unused,
1170 					siginfo_t *info,
1171 					void *ucontext __maybe_unused)
1172 {
1173 	workload_exec_errno = info->si_value.sival_int;
1174 	done = 1;
1175 	child_finished = 1;
1176 }
1177 
1178 static void snapshot_sig_handler(int sig);
1179 static void alarm_sig_handler(int sig);
1180 
1181 int __weak
1182 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
1183 			    struct perf_tool *tool __maybe_unused,
1184 			    perf_event__handler_t process __maybe_unused,
1185 			    struct machine *machine __maybe_unused)
1186 {
1187 	return 0;
1188 }
1189 
1190 static const struct perf_event_mmap_page *
1191 perf_evlist__pick_pc(struct evlist *evlist)
1192 {
1193 	if (evlist) {
1194 		if (evlist->mmap && evlist->mmap[0].base)
1195 			return evlist->mmap[0].base;
1196 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
1197 			return evlist->overwrite_mmap[0].base;
1198 	}
1199 	return NULL;
1200 }
1201 
1202 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
1203 {
1204 	const struct perf_event_mmap_page *pc;
1205 
1206 	pc = perf_evlist__pick_pc(rec->evlist);
1207 	if (pc)
1208 		return pc;
1209 	return NULL;
1210 }
1211 
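/*
 * Synthesize the metadata events that later analysis needs alongside the
 * samples: attrs, features and tracing data for pipe output, the time
 * conversion event, auxtrace info, kernel and module mmaps, thread and CPU
 * maps, BPF events, and mmap/comm records for already running threads.
 */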
1212 static int record__synthesize(struct record *rec, bool tail)
1213 {
1214 	struct perf_session *session = rec->session;
1215 	struct machine *machine = &session->machines.host;
1216 	struct perf_data *data = &rec->data;
1217 	struct record_opts *opts = &rec->opts;
1218 	struct perf_tool *tool = &rec->tool;
1219 	int fd = perf_data__fd(data);
1220 	int err = 0;
1221 
1222 	if (rec->opts.tail_synthesize != tail)
1223 		return 0;
1224 
1225 	if (data->is_pipe) {
1226 		/*
1227 		 * We need to synthesize events first, because some
1228 		 * features work on top of them (on the report side).
1229 		 */
1230 		err = perf_event__synthesize_attrs(tool, rec->evlist,
1231 						   process_synthesized_event);
1232 		if (err < 0) {
1233 			pr_err("Couldn't synthesize attrs.\n");
1234 			goto out;
1235 		}
1236 
1237 		err = perf_event__synthesize_features(tool, session, rec->evlist,
1238 						      process_synthesized_event);
1239 		if (err < 0) {
1240 			pr_err("Couldn't synthesize features.\n");
1241 			return err;
1242 		}
1243 
1244 		if (have_tracepoints(&rec->evlist->core.entries)) {
1245 			/*
1246 			 * FIXME err <= 0 here actually means that
1247 			 * there were no tracepoints so it's not really
1248 			 * an error, just that we don't need to
1249 			 * synthesize anything.  We really have to
1250 			 * return this more properly and also
1251 			 * propagate errors that currently end up calling die()
1252 			 */
1253 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
1254 								  process_synthesized_event);
1255 			if (err <= 0) {
1256 				pr_err("Couldn't record tracing data.\n");
1257 				goto out;
1258 			}
1259 			rec->bytes_written += err;
1260 		}
1261 	}
1262 
1263 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
1264 					  process_synthesized_event, machine);
1265 	if (err)
1266 		goto out;
1267 
1268 	if (rec->opts.full_auxtrace) {
1269 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
1270 					session, process_synthesized_event);
1271 		if (err)
1272 			goto out;
1273 	}
1274 
1275 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
1276 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
1277 							 machine);
1278 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
1279 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1280 				   "Check /proc/kallsyms permission or run as root.\n");
1281 
1282 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
1283 						     machine);
1284 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
1285 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
1286 				   "Check /proc/modules permission or run as root.\n");
1287 	}
1288 
1289 	if (perf_guest) {
1290 		machines__process_guests(&session->machines,
1291 					 perf_event__synthesize_guest_os, tool);
1292 	}
1293 
1294 	err = perf_event__synthesize_extra_attr(&rec->tool,
1295 						rec->evlist,
1296 						process_synthesized_event,
1297 						data->is_pipe);
1298 	if (err)
1299 		goto out;
1300 
1301 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->core.threads,
1302 						 process_synthesized_event,
1303 						NULL);
1304 	if (err < 0) {
1305 		pr_err("Couldn't synthesize thread map.\n");
1306 		return err;
1307 	}
1308 
1309 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->core.cpus,
1310 					     process_synthesized_event, NULL);
1311 	if (err < 0) {
1312 		pr_err("Couldn't synthesize cpu map.\n");
1313 		return err;
1314 	}
1315 
1316 	err = perf_event__synthesize_bpf_events(session, process_synthesized_event,
1317 						machine, opts);
1318 	if (err < 0)
1319 		pr_warning("Couldn't synthesize bpf events.\n");
1320 
1321 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->core.threads,
1322 					    process_synthesized_event, opts->sample_address,
1323 					    1);
1324 out:
1325 	return err;
1326 }
1327 
1328 static int __cmd_record(struct record *rec, int argc, const char **argv)
1329 {
1330 	int err;
1331 	int status = 0;
1332 	unsigned long waking = 0;
1333 	const bool forks = argc > 0;
1334 	struct perf_tool *tool = &rec->tool;
1335 	struct record_opts *opts = &rec->opts;
1336 	struct perf_data *data = &rec->data;
1337 	struct perf_session *session;
1338 	bool disabled = false, draining = false;
1339 	struct evlist *sb_evlist = NULL;
1340 	int fd;
1341 	float ratio = 0;
1342 
1343 	atexit(record__sig_exit);
1344 	signal(SIGCHLD, sig_handler);
1345 	signal(SIGINT, sig_handler);
1346 	signal(SIGTERM, sig_handler);
1347 	signal(SIGSEGV, sigsegv_handler);
1348 
1349 	if (rec->opts.record_namespaces)
1350 		tool->namespace_events = true;
1351 
1352 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
1353 		signal(SIGUSR2, snapshot_sig_handler);
1354 		if (rec->opts.auxtrace_snapshot_mode)
1355 			trigger_on(&auxtrace_snapshot_trigger);
1356 		if (rec->switch_output.enabled)
1357 			trigger_on(&switch_output_trigger);
1358 	} else {
1359 		signal(SIGUSR2, SIG_IGN);
1360 	}
1361 
1362 	session = perf_session__new(data, false, tool);
1363 	if (session == NULL) {
1364 		pr_err("Perf session creation failed.\n");
1365 		return -1;
1366 	}
1367 
1368 	fd = perf_data__fd(data);
1369 	rec->session = session;
1370 
1371 	if (zstd_init(&session->zstd_data, rec->opts.comp_level) < 0) {
1372 		pr_err("Compression initialization failed.\n");
1373 		return -1;
1374 	}
1375 
1376 	session->header.env.comp_type  = PERF_COMP_ZSTD;
1377 	session->header.env.comp_level = rec->opts.comp_level;
1378 
1379 	record__init_features(rec);
1380 
1381 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
1382 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
1383 
1384 	if (forks) {
1385 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
1386 						    argv, data->is_pipe,
1387 						    workload_exec_failed_signal);
1388 		if (err < 0) {
1389 			pr_err("Couldn't run the workload!\n");
1390 			status = err;
1391 			goto out_delete_session;
1392 		}
1393 	}
1394 
1395 	/*
1396 	 * If we have just a single event and are sending data
1397 	 * through a pipe, we need to force the id allocation,
1398 	 * because we synthesize the event name through the pipe
1399 	 * and need the id for that.
1400 	 */
1401 	if (data->is_pipe && rec->evlist->core.nr_entries == 1)
1402 		rec->opts.sample_id = true;
1403 
1404 	if (record__open(rec) != 0) {
1405 		err = -1;
1406 		goto out_child;
1407 	}
1408 	session->header.env.comp_mmap_len = session->evlist->mmap_len;
1409 
1410 	err = bpf__apply_obj_config();
1411 	if (err) {
1412 		char errbuf[BUFSIZ];
1413 
1414 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
1415 		pr_err("ERROR: Apply config to BPF failed: %s\n",
1416 			 errbuf);
1417 		goto out_child;
1418 	}
1419 
1420 	/*
1421 	 * Normally perf_session__new would do this, but it doesn't have the
1422 	 * evlist.
1423 	 */
1424 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
1425 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
1426 		rec->tool.ordered_events = false;
1427 	}
1428 
1429 	if (!rec->evlist->nr_groups)
1430 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
1431 
1432 	if (data->is_pipe) {
1433 		err = perf_header__write_pipe(fd);
1434 		if (err < 0)
1435 			goto out_child;
1436 	} else {
1437 		err = perf_session__write_header(session, rec->evlist, fd, false);
1438 		if (err < 0)
1439 			goto out_child;
1440 	}
1441 
1442 	if (!rec->no_buildid
1443 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
1444 		pr_err("Couldn't generate buildids. "
1445 		       "Use --no-buildid to profile anyway.\n");
1446 		err = -1;
1447 		goto out_child;
1448 	}
1449 
1450 	if (!opts->no_bpf_event)
1451 		bpf_event__add_sb_event(&sb_evlist, &session->header.env);
1452 
1453 	if (perf_evlist__start_sb_thread(sb_evlist, &rec->opts.target)) {
1454 		pr_debug("Couldn't start the BPF side band thread:\nBPF programs starting from now on won't be annotatable\n");
1455 		opts->no_bpf_event = true;
1456 	}
1457 
1458 	err = record__synthesize(rec, false);
1459 	if (err < 0)
1460 		goto out_child;
1461 
1462 	if (rec->realtime_prio) {
1463 		struct sched_param param;
1464 
1465 		param.sched_priority = rec->realtime_prio;
1466 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
1467 			pr_err("Could not set realtime priority.\n");
1468 			err = -1;
1469 			goto out_child;
1470 		}
1471 	}
1472 
1473 	/*
1474 	 * When perf is starting the traced process, all the events
1475 	 * (apart from group members) have enable_on_exec=1 set,
1476 	 * so don't spoil it by prematurely enabling them.
1477 	 */
1478 	if (!target__none(&opts->target) && !opts->initial_delay)
1479 		evlist__enable(rec->evlist);
1480 
1481 	/*
1482 	 * Let the child rip
1483 	 */
1484 	if (forks) {
1485 		struct machine *machine = &session->machines.host;
1486 		union perf_event *event;
1487 		pid_t tgid;
1488 
1489 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1490 		if (event == NULL) {
1491 			err = -ENOMEM;
1492 			goto out_child;
1493 		}
1494 
1495 		/*
1496 		 * Some H/W events are generated before the COMM event,
1497 		 * which is emitted during exec(), so perf script
1498 		 * cannot see a correct process name for those events.
1499 		 * Synthesize a COMM event to prevent it.
1500 		 */
1501 		tgid = perf_event__synthesize_comm(tool, event,
1502 						   rec->evlist->workload.pid,
1503 						   process_synthesized_event,
1504 						   machine);
1505 		free(event);
1506 
1507 		if (tgid == -1)
1508 			goto out_child;
1509 
1510 		event = malloc(sizeof(event->namespaces) +
1511 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1512 			       machine->id_hdr_size);
1513 		if (event == NULL) {
1514 			err = -ENOMEM;
1515 			goto out_child;
1516 		}
1517 
1518 		/*
1519 		 * Synthesize NAMESPACES event for the command specified.
1520 		 */
1521 		perf_event__synthesize_namespaces(tool, event,
1522 						  rec->evlist->workload.pid,
1523 						  tgid, process_synthesized_event,
1524 						  machine);
1525 		free(event);
1526 
1527 		perf_evlist__start_workload(rec->evlist);
1528 	}
1529 
1530 	if (opts->initial_delay) {
1531 		usleep(opts->initial_delay * USEC_PER_MSEC);
1532 		evlist__enable(rec->evlist);
1533 	}
1534 
1535 	trigger_ready(&auxtrace_snapshot_trigger);
1536 	trigger_ready(&switch_output_trigger);
1537 	perf_hooks__invoke_record_start();
1538 	for (;;) {
1539 		unsigned long long hits = rec->samples;
1540 
1541 		/*
1542 		 * rec->evlist->bkw_mmap_state may be
1543 		 * BKW_MMAP_EMPTY here: when done == true and
1544 		 * hits != rec->samples in the previous round.
1545 		 *
1546 		 * perf_evlist__toggle_bkw_mmap() ensures we never
1547 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1548 		 */
1549 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1550 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1551 
1552 		if (record__mmap_read_all(rec, false) < 0) {
1553 			trigger_error(&auxtrace_snapshot_trigger);
1554 			trigger_error(&switch_output_trigger);
1555 			err = -1;
1556 			goto out_child;
1557 		}
1558 
1559 		if (auxtrace_record__snapshot_started) {
1560 			auxtrace_record__snapshot_started = 0;
1561 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1562 				record__read_auxtrace_snapshot(rec, false);
1563 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1564 				pr_err("AUX area tracing snapshot failed\n");
1565 				err = -1;
1566 				goto out_child;
1567 			}
1568 		}
1569 
1570 		if (trigger_is_hit(&switch_output_trigger)) {
1571 			/*
1572 			 * If switch_output_trigger is hit, the data in
1573 			 * the overwritable ring buffer should have been collected,
1574 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1575 			 *
1576 			 * If SIGUSR2 is raised after or during record__mmap_read_all(),
1577 			 * record__mmap_read_all() didn't collect data from
1578 			 * the overwritable ring buffer. Read again.
1579 			 */
1580 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1581 				continue;
1582 			trigger_ready(&switch_output_trigger);
1583 
1584 			/*
1585 			 * Re-enable events in the overwrite ring buffer after
1586 			 * record__mmap_read_all(): we should have collected
1587 			 * data from it.
1588 			 */
1589 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1590 
1591 			if (!quiet)
1592 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1593 					waking);
1594 			waking = 0;
1595 			fd = record__switch_output(rec, false);
1596 			if (fd < 0) {
1597 				pr_err("Failed to switch to new file\n");
1598 				trigger_error(&switch_output_trigger);
1599 				err = fd;
1600 				goto out_child;
1601 			}
1602 
1603 			/* re-arm the alarm */
1604 			if (rec->switch_output.time)
1605 				alarm(rec->switch_output.time);
1606 		}
1607 
1608 		if (hits == rec->samples) {
1609 			if (done || draining)
1610 				break;
1611 			err = perf_evlist__poll(rec->evlist, -1);
1612 			/*
1613 			 * Propagate the error only if there is one. Ignore a positive
1614 			 * number of returned events and interrupt errors.
1615 			 */
1616 			if (err > 0 || (err < 0 && errno == EINTR))
1617 				err = 0;
1618 			waking++;
1619 
1620 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1621 				draining = true;
1622 		}
1623 
1624 		/*
1625 		 * When perf is starting the traced process, at the end events
1626 		 * die with the process and we wait for that. Thus no need to
1627 		 * disable events in this case.
1628 		 */
1629 		if (done && !disabled && !target__none(&opts->target)) {
1630 			trigger_off(&auxtrace_snapshot_trigger);
1631 			evlist__disable(rec->evlist);
1632 			disabled = true;
1633 		}
1634 	}
1635 
1636 	trigger_off(&auxtrace_snapshot_trigger);
1637 	trigger_off(&switch_output_trigger);
1638 
1639 	if (opts->auxtrace_snapshot_on_exit)
1640 		record__auxtrace_snapshot_exit(rec);
1641 
1642 	if (forks && workload_exec_errno) {
1643 		char msg[STRERR_BUFSIZE];
1644 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1645 		pr_err("Workload failed: %s\n", emsg);
1646 		err = -1;
1647 		goto out_child;
1648 	}
1649 
1650 	if (!quiet)
1651 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1652 
1653 	if (target__none(&rec->opts.target))
1654 		record__synthesize_workload(rec, true);
1655 
1656 out_child:
1657 	record__mmap_read_all(rec, true);
1658 	record__aio_mmap_read_sync(rec);
1659 
1660 	if (rec->session->bytes_transferred && rec->session->bytes_compressed) {
1661 		ratio = (float)rec->session->bytes_transferred/(float)rec->session->bytes_compressed;
1662 		session->header.env.comp_ratio = ratio + 0.5;
1663 	}
1664 
1665 	if (forks) {
1666 		int exit_status;
1667 
1668 		if (!child_finished)
1669 			kill(rec->evlist->workload.pid, SIGTERM);
1670 
1671 		wait(&exit_status);
1672 
1673 		if (err < 0)
1674 			status = err;
1675 		else if (WIFEXITED(exit_status))
1676 			status = WEXITSTATUS(exit_status);
1677 		else if (WIFSIGNALED(exit_status))
1678 			signr = WTERMSIG(exit_status);
1679 	} else
1680 		status = err;
1681 
1682 	record__synthesize(rec, true);
1683 	/* this will be recalculated during process_buildids() */
1684 	rec->samples = 0;
1685 
1686 	if (!err) {
1687 		if (!rec->timestamp_filename) {
1688 			record__finish_output(rec);
1689 		} else {
1690 			fd = record__switch_output(rec, true);
1691 			if (fd < 0) {
1692 				status = fd;
1693 				goto out_delete_session;
1694 			}
1695 		}
1696 	}
1697 
1698 	perf_hooks__invoke_record_end();
1699 
1700 	if (!err && !quiet) {
1701 		char samples[128];
1702 		const char *postfix = rec->timestamp_filename ?
1703 					".<timestamp>" : "";
1704 
1705 		if (rec->samples && !rec->opts.full_auxtrace)
1706 			scnprintf(samples, sizeof(samples),
1707 				  " (%" PRIu64 " samples)", rec->samples);
1708 		else
1709 			samples[0] = '\0';
1710 
1711 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s",
1712 			perf_data__size(data) / 1024.0 / 1024.0,
1713 			data->path, postfix, samples);
1714 		if (ratio) {
1715 			fprintf(stderr,	", compressed (original %.3f MB, ratio is %.3f)",
1716 					rec->session->bytes_transferred / 1024.0 / 1024.0,
1717 					ratio);
1718 		}
1719 		fprintf(stderr, " ]\n");
1720 	}
1721 
1722 out_delete_session:
1723 	zstd_fini(&session->zstd_data);
1724 	perf_session__delete(session);
1725 
1726 	if (!opts->no_bpf_event)
1727 		perf_evlist__stop_sb_thread(sb_evlist);
1728 	return status;
1729 }
1730 
1731 static void callchain_debug(struct callchain_param *callchain)
1732 {
1733 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1734 
1735 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1736 
1737 	if (callchain->record_mode == CALLCHAIN_DWARF)
1738 		pr_debug("callchain: stack dump size %d\n",
1739 			 callchain->dump_size);
1740 }
1741 
1742 int record_opts__parse_callchain(struct record_opts *record,
1743 				 struct callchain_param *callchain,
1744 				 const char *arg, bool unset)
1745 {
1746 	int ret;
1747 	callchain->enabled = !unset;
1748 
1749 	/* --no-call-graph */
1750 	if (unset) {
1751 		callchain->record_mode = CALLCHAIN_NONE;
1752 		pr_debug("callchain: disabled\n");
1753 		return 0;
1754 	}
1755 
1756 	ret = parse_callchain_record_opt(arg, callchain);
1757 	if (!ret) {
1758 		/* Enable data address sampling for DWARF unwind. */
1759 		if (callchain->record_mode == CALLCHAIN_DWARF)
1760 			record->sample_address = true;
1761 		callchain_debug(callchain);
1762 	}
1763 
1764 	return ret;
1765 }
1766 
1767 int record_parse_callchain_opt(const struct option *opt,
1768 			       const char *arg,
1769 			       int unset)
1770 {
1771 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1772 }
1773 
1774 int record_callchain_opt(const struct option *opt,
1775 			 const char *arg __maybe_unused,
1776 			 int unset __maybe_unused)
1777 {
1778 	struct callchain_param *callchain = opt->value;
1779 
1780 	callchain->enabled = true;
1781 
1782 	if (callchain->record_mode == CALLCHAIN_NONE)
1783 		callchain->record_mode = CALLCHAIN_FP;
1784 
1785 	callchain_debug(callchain);
1786 	return 0;
1787 }
1788 
1789 static int perf_record_config(const char *var, const char *value, void *cb)
1790 {
1791 	struct record *rec = cb;
1792 
1793 	if (!strcmp(var, "record.build-id")) {
1794 		if (!strcmp(value, "cache"))
1795 			rec->no_buildid_cache = false;
1796 		else if (!strcmp(value, "no-cache"))
1797 			rec->no_buildid_cache = true;
1798 		else if (!strcmp(value, "skip"))
1799 			rec->no_buildid = true;
1800 		else
1801 			return -1;
1802 		return 0;
1803 	}
1804 	if (!strcmp(var, "record.call-graph")) {
1805 		var = "call-graph.record-mode";
1806 		return perf_default_config(var, value, cb);
1807 	}
1808 #ifdef HAVE_AIO_SUPPORT
1809 	if (!strcmp(var, "record.aio")) {
1810 		rec->opts.nr_cblocks = strtol(value, NULL, 0);
1811 		if (!rec->opts.nr_cblocks)
1812 			rec->opts.nr_cblocks = nr_cblocks_default;
1813 	}
1814 #endif
1815 
1816 	return 0;
1817 }
1818 
1819 struct clockid_map {
1820 	const char *name;
1821 	int clockid;
1822 };
1823 
1824 #define CLOCKID_MAP(n, c)	\
1825 	{ .name = n, .clockid = (c), }
1826 
1827 #define CLOCKID_END	{ .name = NULL, }
1828 
1829 
1830 /*
1831  * Add the missing ones, we need to build on many distros...
1832  */
1833 #ifndef CLOCK_MONOTONIC_RAW
1834 #define CLOCK_MONOTONIC_RAW 4
1835 #endif
1836 #ifndef CLOCK_BOOTTIME
1837 #define CLOCK_BOOTTIME 7
1838 #endif
1839 #ifndef CLOCK_TAI
1840 #define CLOCK_TAI 11
1841 #endif
1842 
1843 static const struct clockid_map clockids[] = {
1844 	/* available for all events, NMI safe */
1845 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1846 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1847 
1848 	/* available for some events */
1849 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1850 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1851 	CLOCKID_MAP("tai", CLOCK_TAI),
1852 
1853 	/* available for the lazy */
1854 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1855 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1856 	CLOCKID_MAP("real", CLOCK_REALTIME),
1857 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1858 
1859 	CLOCKID_END,
1860 };
1861 
1862 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1863 {
1864 	struct timespec res;
1865 
1866 	*res_ns = 0;
1867 	if (!clock_getres(clk_id, &res))
1868 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1869 	else
1870 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1871 
1872 	return 0;
1873 }
1874 
1875 static int parse_clockid(const struct option *opt, const char *str, int unset)
1876 {
1877 	struct record_opts *opts = (struct record_opts *)opt->value;
1878 	const struct clockid_map *cm;
1879 	const char *ostr = str;
1880 
1881 	if (unset) {
1882 		opts->use_clockid = 0;
1883 		return 0;
1884 	}
1885 
1886 	/* no arg passed */
1887 	if (!str)
1888 		return 0;
1889 
1890 	/* no setting it twice */
1891 	if (opts->use_clockid)
1892 		return -1;
1893 
1894 	opts->use_clockid = true;
1895 
1896 	/* if it's a number, we're done */
1897 	if (sscanf(str, "%d", &opts->clockid) == 1)
1898 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1899 
1900 	/* allow a "CLOCK_" prefix to the name */
1901 	if (!strncasecmp(str, "CLOCK_", 6))
1902 		str += 6;
1903 
1904 	for (cm = clockids; cm->name; cm++) {
1905 		if (!strcasecmp(str, cm->name)) {
1906 			opts->clockid = cm->clockid;
1907 			return get_clockid_res(opts->clockid,
1908 					       &opts->clockid_res_ns);
1909 		}
1910 	}
1911 
1912 	opts->use_clockid = false;
1913 	ui__warning("unknown clockid %s, check man page\n", ostr);
1914 	return -1;
1915 }
1916 
1917 static int record__parse_affinity(const struct option *opt, const char *str, int unset)
1918 {
1919 	struct record_opts *opts = (struct record_opts *)opt->value;
1920 
1921 	if (unset || !str)
1922 		return 0;
1923 
1924 	if (!strcasecmp(str, "node"))
1925 		opts->affinity = PERF_AFFINITY_NODE;
1926 	else if (!strcasecmp(str, "cpu"))
1927 		opts->affinity = PERF_AFFINITY_CPU;
1928 
1929 	return 0;
1930 }
1931 
1932 static int record__parse_mmap_pages(const struct option *opt,
1933 				    const char *str,
1934 				    int unset __maybe_unused)
1935 {
1936 	struct record_opts *opts = opt->value;
1937 	char *s, *p;
1938 	unsigned int mmap_pages;
1939 	int ret;
1940 
1941 	if (!str)
1942 		return -EINVAL;
1943 
1944 	s = strdup(str);
1945 	if (!s)
1946 		return -ENOMEM;
1947 
1948 	p = strchr(s, ',');
1949 	if (p)
1950 		*p = '\0';
1951 
1952 	if (*s) {
1953 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1954 		if (ret)
1955 			goto out_free;
1956 		opts->mmap_pages = mmap_pages;
1957 	}
1958 
1959 	if (!p) {
1960 		ret = 0;
1961 		goto out_free;
1962 	}
1963 
1964 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1965 	if (ret)
1966 		goto out_free;
1967 
1968 	opts->auxtrace_mmap_pages = mmap_pages;
1969 
1970 out_free:
1971 	free(s);
1972 	return ret;
1973 }
1974 
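/*
 * Warn when the requested switch-output size threshold is smaller than half
 * of the mmap buffer size used for wakeups: the resulting perf.data files
 * can then come out noticeably bigger than the requested size.
 */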
1975 static void switch_output_size_warn(struct record *rec)
1976 {
1977 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1978 	struct switch_output *s = &rec->switch_output;
1979 
1980 	wakeup_size /= 2;
1981 
1982 	if (s->size < wakeup_size) {
1983 		char buf[100];
1984 
1985 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1986 		pr_warning("WARNING: switch-output data size lower than "
1987 			   "wakeup kernel buffer size (%s), "
1988 			   "expect bigger perf.data sizes\n", buf);
1989 	}
1990 }
1991 
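/*
 * Set up --switch-output: the argument is either "signal" (rotate on
 * SIGUSR2), a size with a B/K/M/G suffix or a time with an s/m/h/d suffix.
 * Illustrative invocations:
 *
 *   perf record --switch-output=signal ...
 *   perf record --switch-output=1G ...
 *   perf record --switch-output=30s ...
 */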
1992 static int switch_output_setup(struct record *rec)
1993 {
1994 	struct switch_output *s = &rec->switch_output;
1995 	static struct parse_tag tags_size[] = {
1996 		{ .tag  = 'B', .mult = 1       },
1997 		{ .tag  = 'K', .mult = 1 << 10 },
1998 		{ .tag  = 'M', .mult = 1 << 20 },
1999 		{ .tag  = 'G', .mult = 1 << 30 },
2000 		{ .tag  = 0 },
2001 	};
2002 	static struct parse_tag tags_time[] = {
2003 		{ .tag  = 's', .mult = 1        },
2004 		{ .tag  = 'm', .mult = 60       },
2005 		{ .tag  = 'h', .mult = 60*60    },
2006 		{ .tag  = 'd', .mult = 60*60*24 },
2007 		{ .tag  = 0 },
2008 	};
2009 	unsigned long val;
2010 
2011 	if (!s->set)
2012 		return 0;
2013 
2014 	if (!strcmp(s->str, "signal")) {
2015 		s->signal = true;
2016 		pr_debug("switch-output with SIGUSR2 signal\n");
2017 		goto enabled;
2018 	}
2019 
2020 	val = parse_tag_value(s->str, tags_size);
2021 	if (val != (unsigned long) -1) {
2022 		s->size = val;
2023 		pr_debug("switch-output with %s size threshold\n", s->str);
2024 		goto enabled;
2025 	}
2026 
2027 	val = parse_tag_value(s->str, tags_time);
2028 	if (val != (unsigned long) -1) {
2029 		s->time = val;
2030 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
2031 			 s->str, s->time);
2032 		goto enabled;
2033 	}
2034 
2035 	return -1;
2036 
2037 enabled:
2038 	rec->timestamp_filename = true;
2039 	s->enabled              = true;
2040 
2041 	if (s->size && !rec->opts.no_buffering)
2042 		switch_output_size_warn(rec);
2043 
2044 	return 0;
2045 }
2046 
2047 static const char * const __record_usage[] = {
2048 	"perf record [<options>] [<command>]",
2049 	"perf record [<options>] -- <command> [<options>]",
2050 	NULL
2051 };
2052 const char * const *record_usage = __record_usage;
2053 
2054 /*
2055  * XXX Ideally this would be local to cmd_record() and passed to a record__new,
2056  * because we need access to it in record__exit, which is called
2057  * after cmd_record() exits, but since record_options needs to be accessible to
2058  * builtin-script, leave it here.
2059  *
2060  * At least we don't touch it in all the other functions here directly.
2061  *
2062  * Just say no to tons of global variables, sigh.
2063  */
2064 static struct record record = {
2065 	.opts = {
2066 		.sample_time	     = true,
2067 		.mmap_pages	     = UINT_MAX,
2068 		.user_freq	     = UINT_MAX,
2069 		.user_interval	     = ULLONG_MAX,
2070 		.freq		     = 4000,
2071 		.target		     = {
2072 			.uses_mmap   = true,
2073 			.default_per_cpu = true,
2074 		},
2075 		.mmap_flush          = MMAP_FLUSH_DEFAULT,
2076 	},
2077 	.tool = {
2078 		.sample		= process_sample_event,
2079 		.fork		= perf_event__process_fork,
2080 		.exit		= perf_event__process_exit,
2081 		.comm		= perf_event__process_comm,
2082 		.namespaces	= perf_event__process_namespaces,
2083 		.mmap		= perf_event__process_mmap,
2084 		.mmap2		= perf_event__process_mmap2,
2085 		.ordered_events	= true,
2086 	},
2087 };
2088 
2089 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
2090 	"\n\t\t\t\tDefault: fp";
2091 
2092 static bool dry_run;
2093 
2094 /*
2095  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
2096  * with it and switch to using the library functions in perf_evlist that came
2097  * from builtin-record.c, i.e. use record_opts,
2098  * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record',
2099  * using pipes, etc.
2100  */
2101 static struct option __record_options[] = {
2102 	OPT_CALLBACK('e', "event", &record.evlist, "event",
2103 		     "event selector. use 'perf list' to list available events",
2104 		     parse_events_option),
2105 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
2106 		     "event filter", parse_filter),
2107 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
2108 			   NULL, "don't record events from perf itself",
2109 			   exclude_perf),
2110 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
2111 		    "record events on existing process id"),
2112 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
2113 		    "record events on existing thread id"),
2114 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
2115 		    "collect data with this RT SCHED_FIFO priority"),
2116 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
2117 		    "collect data without buffering"),
2118 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
2119 		    "collect raw sample records from all opened counters"),
2120 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
2121 			    "system-wide collection from all CPUs"),
2122 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
2123 		    "list of cpus to monitor"),
2124 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
2125 	OPT_STRING('o', "output", &record.data.path, "file",
2126 		    "output file name"),
2127 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
2128 			&record.opts.no_inherit_set,
2129 			"child tasks do not inherit counters"),
2130 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
2131 		    "synthesize non-sample events at the end of output"),
2132 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
2133 	OPT_BOOLEAN(0, "no-bpf-event", &record.opts.no_bpf_event, "do not record bpf events"),
2134 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
2135 		    "Fail if the specified frequency can't be used"),
2136 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
2137 		     "profile at this frequency",
2138 		      record__parse_freq),
2139 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
2140 		     "number of mmap data pages and AUX area tracing mmap pages",
2141 		     record__parse_mmap_pages),
2142 	OPT_CALLBACK(0, "mmap-flush", &record.opts, "number",
2143 		     "Minimum number of bytes that is extracted from mmap data pages (default: 1)",
2144 		     record__mmap_flush_parse),
2145 	OPT_BOOLEAN(0, "group", &record.opts.group,
2146 		    "put the counters into a counter group"),
2147 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
2148 			   NULL, "enables call-graph recording",
2149 			   &record_callchain_opt),
2150 	OPT_CALLBACK(0, "call-graph", &record.opts,
2151 		     "record_mode[,record_size]", record_callchain_help,
2152 		     &record_parse_callchain_opt),
2153 	OPT_INCR('v', "verbose", &verbose,
2154 		    "be more verbose (show counter open errors, etc)"),
2155 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
2156 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
2157 		    "per thread counts"),
2158 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
2159 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
2160 		    "Record the sample physical addresses"),
2161 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
2162 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
2163 			&record.opts.sample_time_set,
2164 			"Record the sample timestamps"),
2165 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
2166 			"Record the sample period"),
2167 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
2168 		    "don't sample"),
2169 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
2170 			&record.no_buildid_cache_set,
2171 			"do not update the buildid cache"),
2172 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
2173 			&record.no_buildid_set,
2174 			"do not collect buildids in perf.data"),
2175 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
2176 		     "monitor event in cgroup name only",
2177 		     parse_cgroups),
2178 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
2179 		  "ms to wait before starting measurement after program start"),
2180 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
2181 		   "user to profile"),
2182 
2183 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
2184 		     "branch any", "sample any taken branches",
2185 		     parse_branch_stack),
2186 
2187 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
2188 		     "branch filter mask", "branch stack filter modes",
2189 		     parse_branch_stack),
2190 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
2191 		    "sample by weight (on special events only)"),
2192 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
2193 		    "sample transaction flags (special events only)"),
2194 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
2195 		    "use per-thread mmaps"),
2196 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
2197 		    "sample selected machine registers on interrupt,"
2198 		    " use '-I?' to list register names", parse_intr_regs),
2199 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
2200 		    "sample selected machine registers in user space,"
2201 		    " use '--user-regs=?' to list register names", parse_user_regs),
2202 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
2203 		    "Record running/enabled time of read (:S) events"),
2204 	OPT_CALLBACK('k', "clockid", &record.opts, "clockid",
2205 		     "clockid to use for events, see clock_gettime()",
2206 		     parse_clockid),
2207 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
2208 			  "opts", "AUX area tracing Snapshot Mode", ""),
2209 	OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout,
2210 			"per thread proc mmap processing timeout in ms"),
2211 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
2212 		    "Record namespaces events"),
2213 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
2214 		    "Record context switch events"),
2215 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
2216 			 "Configure all used events to run in kernel space.",
2217 			 PARSE_OPT_EXCLUSIVE),
2218 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
2219 			 "Configure all used events to run in user space.",
2220 			 PARSE_OPT_EXCLUSIVE),
2221 	OPT_BOOLEAN(0, "kernel-callchains", &record.opts.kernel_callchains,
2222 		    "collect kernel callchains"),
2223 	OPT_BOOLEAN(0, "user-callchains", &record.opts.user_callchains,
2224 		    "collect user callchains"),
2225 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
2226 		   "clang binary to use for compiling BPF scriptlets"),
2227 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
2228 		   "options passed to clang when compiling BPF scriptlets"),
2229 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
2230 		   "file", "vmlinux pathname"),
2231 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
2232 		    "Record build-id of all DSOs regardless of hits"),
2233 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
2234 		    "append timestamp to output filename"),
2235 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
2236 		    "Record timestamp boundary (time of first/last samples)"),
2237 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
2238 			  &record.switch_output.set, "signal or size[BKMG] or time[smhd]",
2239 			  "Switch output when receiving SIGUSR2 (signal) or crossing a size or time threshold",
2240 			  "signal"),
2241 	OPT_INTEGER(0, "switch-max-files", &record.switch_output.num_files,
2242 		   "Limit the number of files generated by switch output"),
2243 	OPT_BOOLEAN(0, "dry-run", &dry_run,
2244 		    "Parse options then exit"),
2245 #ifdef HAVE_AIO_SUPPORT
2246 	OPT_CALLBACK_OPTARG(0, "aio", &record.opts,
2247 		     &nr_cblocks_default, "n", "Use <n> control blocks in asynchronous trace writing mode (default: 1, max: 4)",
2248 		     record__aio_parse),
2249 #endif
2250 	OPT_CALLBACK(0, "affinity", &record.opts, "node|cpu",
2251 		     "Set affinity mask of the trace reading thread to the NUMA node cpu mask or the cpu of the processed mmap buffer",
2252 		     record__parse_affinity),
2253 #ifdef HAVE_ZSTD_SUPPORT
2254 	OPT_CALLBACK_OPTARG('z', "compression-level", &record.opts, &comp_level_default,
2255 			    "n", "Compress records using the specified level (default: 1 - fastest compression, 22 - greatest compression)",
2256 			    record__parse_comp_level),
2257 #endif
2258 	OPT_END()
2259 };
2260 
2261 struct option *record_options = __record_options;
2262 
2263 int cmd_record(int argc, const char **argv)
2264 {
2265 	int err;
2266 	struct record *rec = &record;
2267 	char errbuf[BUFSIZ];
2268 
2269 	setlocale(LC_ALL, "");
2270 
2271 #ifndef HAVE_LIBBPF_SUPPORT
2272 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
2273 	set_nobuild('\0', "clang-path", true);
2274 	set_nobuild('\0', "clang-opt", true);
2275 # undef set_nobuild
2276 #endif
2277 
2278 #ifndef HAVE_BPF_PROLOGUE
2279 # if !defined (HAVE_DWARF_SUPPORT)
2280 #  define REASON  "NO_DWARF=1"
2281 # elif !defined (HAVE_LIBBPF_SUPPORT)
2282 #  define REASON  "NO_LIBBPF=1"
2283 # else
2284 #  define REASON  "this architecture doesn't support BPF prologue"
2285 # endif
2286 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
2287 	set_nobuild('\0', "vmlinux", true);
2288 # undef set_nobuild
2289 # undef REASON
2290 #endif
2291 
2292 	CPU_ZERO(&rec->affinity_mask);
2293 	rec->opts.affinity = PERF_AFFINITY_SYS;
2294 
2295 	rec->evlist = evlist__new();
2296 	if (rec->evlist == NULL)
2297 		return -ENOMEM;
2298 
2299 	err = perf_config(perf_record_config, rec);
2300 	if (err)
2301 		return err;
2302 
2303 	argc = parse_options(argc, argv, record_options, record_usage,
2304 			    PARSE_OPT_STOP_AT_NON_OPTION);
2305 	if (quiet)
2306 		perf_quiet_option();
2307 
2308 	/* Make system wide (-a) the default target. */
2309 	if (!argc && target__none(&rec->opts.target))
2310 		rec->opts.target.system_wide = true;
2311 
2312 	if (nr_cgroups && !rec->opts.target.system_wide) {
2313 		usage_with_options_msg(record_usage, record_options,
2314 			"cgroup monitoring only available in system-wide mode");
2316 	}
2317 
2318 	if (rec->opts.comp_level != 0) {
2319 		pr_debug("Compression enabled, disabling build id collection at the end of the session.\n");
2320 		rec->no_buildid = true;
2321 	}
2322 
2323 	if (rec->opts.record_switch_events &&
2324 	    !perf_can_record_switch_events()) {
2325 		ui__error("kernel does not support recording context switch events\n");
2326 		parse_options_usage(record_usage, record_options, "switch-events", 0);
2327 		return -EINVAL;
2328 	}
2329 
2330 	if (switch_output_setup(rec)) {
2331 		parse_options_usage(record_usage, record_options, "switch-output", 0);
2332 		return -EINVAL;
2333 	}
2334 
2335 	if (rec->switch_output.time) {
2336 		signal(SIGALRM, alarm_sig_handler);
2337 		alarm(rec->switch_output.time);
2338 	}
2339 
2340 	if (rec->switch_output.num_files) {
2341 		rec->switch_output.filenames = calloc(rec->switch_output.num_files,
2342 						      sizeof(char *));
2343 		if (!rec->switch_output.filenames)
2344 			return -EINVAL;
2345 	}
2346 
2347 	/*
2348 	 * Allow aliases to facilitate the lookup of symbols for address
2349 	 * filters. Refer to auxtrace_parse_filters().
2350 	 */
2351 	symbol_conf.allow_aliases = true;
2352 
2353 	symbol__init(NULL);
2354 
2355 	err = record__auxtrace_init(rec);
2356 	if (err)
2357 		goto out;
2358 
2359 	if (dry_run)
2360 		goto out;
2361 
2362 	err = bpf__setup_stdout(rec->evlist);
2363 	if (err) {
2364 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
2365 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
2366 		       errbuf);
2367 		goto out;
2368 	}
2369 
2370 	err = -ENOMEM;
2371 
2372 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
2373 		pr_warning(
2374 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
2375 "check /proc/sys/kernel/kptr_restrict.\n\n"
2376 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
2377 "file is not found in the buildid cache or in the vmlinux path.\n\n"
2378 "Samples in kernel modules won't be resolved at all.\n\n"
2379 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
2380 "even with a suitable vmlinux or kallsyms file.\n\n");
2381 
2382 	if (rec->no_buildid_cache || rec->no_buildid) {
2383 		disable_buildid_cache();
2384 	} else if (rec->switch_output.enabled) {
2385 		/*
2386 		 * In 'perf record --switch-output', disable buildid
2387 		 * generation by default to reduce data file switching
2388 		 * overhead. Still generate buildids if they are required
2389 		 * explicitly using
2390 		 *
2391 		 *  perf record --switch-output --no-no-buildid \
2392 		 *              --no-no-buildid-cache
2393 		 *
2394 		 * Following code equals to:
2395 		 *
2396 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
2397 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
2398 		 *         disable_buildid_cache();
2399 		 */
2400 		bool disable = true;
2401 
2402 		if (rec->no_buildid_set && !rec->no_buildid)
2403 			disable = false;
2404 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
2405 			disable = false;
2406 		if (disable) {
2407 			rec->no_buildid = true;
2408 			rec->no_buildid_cache = true;
2409 			disable_buildid_cache();
2410 		}
2411 	}
2412 
2413 	if (record.opts.overwrite)
2414 		record.opts.tail_synthesize = true;
2415 
2416 	if (rec->evlist->core.nr_entries == 0 &&
2417 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
2418 		pr_err("Not enough memory for event selector list\n");
2419 		goto out;
2420 	}
2421 
2422 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
2423 		rec->opts.no_inherit = true;
2424 
2425 	err = target__validate(&rec->opts.target);
2426 	if (err) {
2427 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2428 		ui__warning("%s\n", errbuf);
2429 	}
2430 
2431 	err = target__parse_uid(&rec->opts.target);
2432 	if (err) {
2433 		int saved_errno = errno;
2434 
2435 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
2436 		ui__error("%s", errbuf);
2437 
2438 		err = -saved_errno;
2439 		goto out;
2440 	}
2441 
2442 	/* Enable ignoring missing threads when -u/-p option is defined. */
2443 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
2444 
2445 	err = -ENOMEM;
2446 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
2447 		usage_with_options(record_usage, record_options);
2448 
2449 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
2450 	if (err)
2451 		goto out;
2452 
2453 	/*
2454 	 * We take all buildids when the file contains
2455 	 * AUX area tracing data, because we do not decode
2456 	 * the trace; decoding it would take too long.
2457 	 */
2458 	if (rec->opts.full_auxtrace)
2459 		rec->buildid_all = true;
2460 
2461 	if (record_opts__config(&rec->opts)) {
2462 		err = -EINVAL;
2463 		goto out;
2464 	}
2465 
2466 	if (rec->opts.nr_cblocks > nr_cblocks_max)
2467 		rec->opts.nr_cblocks = nr_cblocks_max;
2468 	pr_debug("nr_cblocks: %d\n", rec->opts.nr_cblocks);
2469 
2470 	pr_debug("affinity: %s\n", affinity_tags[rec->opts.affinity]);
2471 	pr_debug("mmap flush: %d\n", rec->opts.mmap_flush);
2472 
2473 	if (rec->opts.comp_level > comp_level_max)
2474 		rec->opts.comp_level = comp_level_max;
2475 	pr_debug("comp level: %d\n", rec->opts.comp_level);
2476 
2477 	err = __cmd_record(&record, argc, argv);
2478 out:
2479 	evlist__delete(rec->evlist);
2480 	symbol__exit();
2481 	auxtrace_record__free(rec->itr);
2482 	return err;
2483 }
2484 
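/*
 * SIGUSR2 handler: kick off an AUX area tracing snapshot if snapshot mode is
 * armed, and/or request an output rotation when --switch-output=signal is in
 * effect.  The main record loop does the actual work once a trigger is hit.
 */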
2485 static void snapshot_sig_handler(int sig __maybe_unused)
2486 {
2487 	struct record *rec = &record;
2488 
2489 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
2490 		trigger_hit(&auxtrace_snapshot_trigger);
2491 		auxtrace_record__snapshot_started = 1;
2492 		if (auxtrace_record__snapshot_start(record.itr))
2493 			trigger_error(&auxtrace_snapshot_trigger);
2494 	}
2495 
2496 	if (switch_output_signal(rec))
2497 		trigger_hit(&switch_output_trigger);
2498 }
2499 
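/*
 * SIGALRM handler armed by cmd_record() when a time based --switch-output
 * threshold is configured: request an output rotation once the interval has
 * elapsed.
 */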
2500 static void alarm_sig_handler(int sig __maybe_unused)
2501 {
2502 	struct record *rec = &record;
2503 
2504 	if (switch_output_time(rec))
2505 		trigger_hit(&switch_output_trigger);
2506 }
2507