xref: /linux/tools/perf/builtin-record.c (revision 0a98bf52b15dfd66da2cf666495b3f7841c7b5ab)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * builtin-record.c
4  *
5  * Builtin record command: Record the profile of a workload
6  * (or a CPU, or a PID) into the perf.data output file - for
7  * later analysis via perf report.
8  */
9 #include "builtin.h"
10 
11 #include "perf.h"
12 
13 #include "util/build-id.h"
14 #include "util/util.h"
15 #include <subcmd/parse-options.h>
16 #include "util/parse-events.h"
17 #include "util/config.h"
18 
19 #include "util/callchain.h"
20 #include "util/cgroup.h"
21 #include "util/header.h"
22 #include "util/event.h"
23 #include "util/evlist.h"
24 #include "util/evsel.h"
25 #include "util/debug.h"
26 #include "util/drv_configs.h"
27 #include "util/session.h"
28 #include "util/tool.h"
29 #include "util/symbol.h"
30 #include "util/cpumap.h"
31 #include "util/thread_map.h"
32 #include "util/data.h"
33 #include "util/perf_regs.h"
34 #include "util/auxtrace.h"
35 #include "util/tsc.h"
36 #include "util/parse-branch-options.h"
37 #include "util/parse-regs-options.h"
38 #include "util/llvm-utils.h"
39 #include "util/bpf-loader.h"
40 #include "util/trigger.h"
41 #include "util/perf-hooks.h"
42 #include "util/time-utils.h"
43 #include "util/units.h"
44 #include "asm/bug.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <locale.h>
49 #include <poll.h>
50 #include <unistd.h>
51 #include <sched.h>
52 #include <signal.h>
53 #include <sys/mman.h>
54 #include <sys/wait.h>
55 #include <linux/time64.h>
56 
57 struct switch_output {
58 	bool		 enabled;
59 	bool		 signal;
60 	unsigned long	 size;
61 	unsigned long	 time;
62 	const char	*str;
63 	bool		 set;
64 };
65 
66 struct record {
67 	struct perf_tool	tool;
68 	struct record_opts	opts;
69 	u64			bytes_written;
70 	struct perf_data	data;
71 	struct auxtrace_record	*itr;
72 	struct perf_evlist	*evlist;
73 	struct perf_session	*session;
74 	int			realtime_prio;
75 	bool			no_buildid;
76 	bool			no_buildid_set;
77 	bool			no_buildid_cache;
78 	bool			no_buildid_cache_set;
79 	bool			buildid_all;
80 	bool			timestamp_filename;
81 	bool			timestamp_boundary;
82 	struct switch_output	switch_output;
83 	unsigned long long	samples;
84 };
85 
86 static volatile int auxtrace_record__snapshot_started;
87 static DEFINE_TRIGGER(auxtrace_snapshot_trigger);
88 static DEFINE_TRIGGER(switch_output_trigger);
89 
90 static bool switch_output_signal(struct record *rec)
91 {
92 	return rec->switch_output.signal &&
93 	       trigger_is_ready(&switch_output_trigger);
94 }
95 
96 static bool switch_output_size(struct record *rec)
97 {
98 	return rec->switch_output.size &&
99 	       trigger_is_ready(&switch_output_trigger) &&
100 	       (rec->bytes_written >= rec->switch_output.size);
101 }
102 
103 static bool switch_output_time(struct record *rec)
104 {
105 	return rec->switch_output.time &&
106 	       trigger_is_ready(&switch_output_trigger);
107 }
108 
109 static int record__write(struct record *rec, struct perf_mmap *map __maybe_unused,
110 			 void *bf, size_t size)
111 {
112 	struct perf_data_file *file = &rec->session->data->file;
113 
114 	if (perf_data_file__write(file, bf, size) < 0) {
115 		pr_err("failed to write perf data, error: %m\n");
116 		return -1;
117 	}
118 
119 	rec->bytes_written += size;
120 
121 	if (switch_output_size(rec))
122 		trigger_hit(&switch_output_trigger);
123 
124 	return 0;
125 }
126 
127 static int process_synthesized_event(struct perf_tool *tool,
128 				     union perf_event *event,
129 				     struct perf_sample *sample __maybe_unused,
130 				     struct machine *machine __maybe_unused)
131 {
132 	struct record *rec = container_of(tool, struct record, tool);
133 	return record__write(rec, NULL, event, event->header.size);
134 }
135 
136 static int record__pushfn(struct perf_mmap *map, void *to, void *bf, size_t size)
137 {
138 	struct record *rec = to;
139 
140 	rec->samples++;
141 	return record__write(rec, map, bf, size);
142 }
143 
144 static volatile int done;
145 static volatile int signr = -1;
146 static volatile int child_finished;
147 
148 static void sig_handler(int sig)
149 {
150 	if (sig == SIGCHLD)
151 		child_finished = 1;
152 	else
153 		signr = sig;
154 
155 	done = 1;
156 }
157 
158 static void sigsegv_handler(int sig)
159 {
160 	perf_hooks__recover();
161 	sighandler_dump_stack(sig);
162 }
163 
164 static void record__sig_exit(void)
165 {
166 	if (signr == -1)
167 		return;
168 
169 	signal(signr, SIG_DFL);
170 	raise(signr);
171 }
172 
173 #ifdef HAVE_AUXTRACE_SUPPORT
174 
175 static int record__process_auxtrace(struct perf_tool *tool,
176 				    struct perf_mmap *map,
177 				    union perf_event *event, void *data1,
178 				    size_t len1, void *data2, size_t len2)
179 {
180 	struct record *rec = container_of(tool, struct record, tool);
181 	struct perf_data *data = &rec->data;
182 	size_t padding;
183 	u8 pad[8] = {0};
184 
185 	if (!perf_data__is_pipe(data)) {
186 		off_t file_offset;
187 		int fd = perf_data__fd(data);
188 		int err;
189 
190 		file_offset = lseek(fd, 0, SEEK_CUR);
191 		if (file_offset == -1)
192 			return -1;
193 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
194 						     event, file_offset);
195 		if (err)
196 			return err;
197 	}
198 
199 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
200 	padding = (len1 + len2) & 7;
201 	if (padding)
202 		padding = 8 - padding;
203 
204 	record__write(rec, map, event, event->header.size);
205 	record__write(rec, map, data1, len1);
206 	if (len2)
207 		record__write(rec, map, data2, len2);
208 	record__write(rec, map, &pad, padding);
209 
210 	return 0;
211 }
212 
213 static int record__auxtrace_mmap_read(struct record *rec,
214 				      struct perf_mmap *map)
215 {
216 	int ret;
217 
218 	ret = auxtrace_mmap__read(map, rec->itr, &rec->tool,
219 				  record__process_auxtrace);
220 	if (ret < 0)
221 		return ret;
222 
223 	if (ret)
224 		rec->samples++;
225 
226 	return 0;
227 }
228 
229 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
230 					       struct perf_mmap *map)
231 {
232 	int ret;
233 
234 	ret = auxtrace_mmap__read_snapshot(map, rec->itr, &rec->tool,
235 					   record__process_auxtrace,
236 					   rec->opts.auxtrace_snapshot_size);
237 	if (ret < 0)
238 		return ret;
239 
240 	if (ret)
241 		rec->samples++;
242 
243 	return 0;
244 }
245 
246 static int record__auxtrace_read_snapshot_all(struct record *rec)
247 {
248 	int i;
249 	int rc = 0;
250 
251 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
252 		struct perf_mmap *map = &rec->evlist->mmap[i];
253 
254 		if (!map->auxtrace_mmap.base)
255 			continue;
256 
257 		if (record__auxtrace_mmap_read_snapshot(rec, map) != 0) {
258 			rc = -1;
259 			goto out;
260 		}
261 	}
262 out:
263 	return rc;
264 }
265 
266 static void record__read_auxtrace_snapshot(struct record *rec)
267 {
268 	pr_debug("Recording AUX area tracing snapshot\n");
269 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
270 		trigger_error(&auxtrace_snapshot_trigger);
271 	} else {
272 		if (auxtrace_record__snapshot_finish(rec->itr))
273 			trigger_error(&auxtrace_snapshot_trigger);
274 		else
275 			trigger_ready(&auxtrace_snapshot_trigger);
276 	}
277 }
278 
279 static int record__auxtrace_init(struct record *rec)
280 {
281 	int err;
282 
283 	if (!rec->itr) {
284 		rec->itr = auxtrace_record__init(rec->evlist, &err);
285 		if (err)
286 			return err;
287 	}
288 
289 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
290 					      rec->opts.auxtrace_snapshot_opts);
291 	if (err)
292 		return err;
293 
294 	return auxtrace_parse_filters(rec->evlist);
295 }
296 
297 #else
298 
299 static inline
300 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
301 			       struct perf_mmap *map __maybe_unused)
302 {
303 	return 0;
304 }
305 
306 static inline
307 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
308 {
309 }
310 
311 static inline
312 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
313 {
314 	return 0;
315 }
316 
317 static int record__auxtrace_init(struct record *rec __maybe_unused)
318 {
319 	return 0;
320 }
321 
322 #endif
323 
324 static int record__mmap_evlist(struct record *rec,
325 			       struct perf_evlist *evlist)
326 {
327 	struct record_opts *opts = &rec->opts;
328 	char msg[512];
329 
330 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages,
331 				 opts->auxtrace_mmap_pages,
332 				 opts->auxtrace_snapshot_mode) < 0) {
333 		if (errno == EPERM) {
334 			pr_err("Permission error mapping pages.\n"
335 			       "Consider increasing "
336 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
337 			       "or try again with a smaller value of -m/--mmap_pages.\n"
338 			       "(current value: %u,%u)\n",
339 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
340 			return -errno;
341 		} else {
342 			pr_err("failed to mmap with %d (%s)\n", errno,
343 				str_error_r(errno, msg, sizeof(msg)));
344 			if (errno)
345 				return -errno;
346 			else
347 				return -EINVAL;
348 		}
349 	}
350 	return 0;
351 }
352 
353 static int record__mmap(struct record *rec)
354 {
355 	return record__mmap_evlist(rec, rec->evlist);
356 }
357 
358 static int record__open(struct record *rec)
359 {
360 	char msg[BUFSIZ];
361 	struct perf_evsel *pos;
362 	struct perf_evlist *evlist = rec->evlist;
363 	struct perf_session *session = rec->session;
364 	struct record_opts *opts = &rec->opts;
365 	struct perf_evsel_config_term *err_term;
366 	int rc = 0;
367 
368 	/*
369 	 * For initial_delay we need to add a dummy event so that we can track
370 	 * PERF_RECORD_MMAP while we wait for the initial delay to enable the
371 	 * real events, the ones asked by the user.
372 	 */
373 	if (opts->initial_delay) {
374 		if (perf_evlist__add_dummy(evlist))
375 			return -ENOMEM;
376 
377 		pos = perf_evlist__first(evlist);
378 		pos->tracking = 0;
379 		pos = perf_evlist__last(evlist);
380 		pos->tracking = 1;
381 		pos->attr.enable_on_exec = 1;
382 	}
383 
384 	perf_evlist__config(evlist, opts, &callchain_param);
385 
386 	evlist__for_each_entry(evlist, pos) {
387 try_again:
388 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
389 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
390 				if (verbose > 0)
391 					ui__warning("%s\n", msg);
392 				goto try_again;
393 			}
394 
395 			rc = -errno;
396 			perf_evsel__open_strerror(pos, &opts->target,
397 						  errno, msg, sizeof(msg));
398 			ui__error("%s\n", msg);
399 			goto out;
400 		}
401 
402 		pos->supported = true;
403 	}
404 
405 	if (perf_evlist__apply_filters(evlist, &pos)) {
406 		pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n",
407 			pos->filter, perf_evsel__name(pos), errno,
408 			str_error_r(errno, msg, sizeof(msg)));
409 		rc = -1;
410 		goto out;
411 	}
412 
413 	if (perf_evlist__apply_drv_configs(evlist, &pos, &err_term)) {
414 		pr_err("failed to set config \"%s\" on event %s with %d (%s)\n",
415 		      err_term->val.drv_cfg, perf_evsel__name(pos), errno,
416 		      str_error_r(errno, msg, sizeof(msg)));
417 		rc = -1;
418 		goto out;
419 	}
420 
421 	rc = record__mmap(rec);
422 	if (rc)
423 		goto out;
424 
425 	session->evlist = evlist;
426 	perf_session__set_id_hdr_size(session);
427 out:
428 	return rc;
429 }
430 
431 static int process_sample_event(struct perf_tool *tool,
432 				union perf_event *event,
433 				struct perf_sample *sample,
434 				struct perf_evsel *evsel,
435 				struct machine *machine)
436 {
437 	struct record *rec = container_of(tool, struct record, tool);
438 
439 	if (rec->evlist->first_sample_time == 0)
440 		rec->evlist->first_sample_time = sample->time;
441 
442 	rec->evlist->last_sample_time = sample->time;
443 
444 	if (rec->buildid_all)
445 		return 0;
446 
447 	rec->samples++;
448 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
449 }
450 
451 static int process_buildids(struct record *rec)
452 {
453 	struct perf_data *data = &rec->data;
454 	struct perf_session *session = rec->session;
455 
456 	if (data->size == 0)
457 		return 0;
458 
459 	/*
460 	 * During this process, it'll load kernel map and replace the
461 	 * dso->long_name to a real pathname it found.  In this case
462 	 * we prefer the vmlinux path like
463 	 *   /lib/modules/3.16.4/build/vmlinux
464 	 *
465 	 * rather than build-id path (in debug directory).
466 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
467 	 */
468 	symbol_conf.ignore_vmlinux_buildid = true;
469 
470 	/*
471 	 * If --buildid-all is given, it marks all DSO regardless of hits,
472 	 * so no need to process samples. But if timestamp_boundary is enabled,
473 	 * it still needs to walk on all samples to get the timestamps of
474 	 * first/last samples.
475 	 */
476 	if (rec->buildid_all && !rec->timestamp_boundary)
477 		rec->tool.sample = NULL;
478 
479 	return perf_session__process_events(session);
480 }
481 
482 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
483 {
484 	int err;
485 	struct perf_tool *tool = data;
486 	/*
487 	 *As for guest kernel when processing subcommand record&report,
488 	 *we arrange module mmap prior to guest kernel mmap and trigger
489 	 *a preload dso because default guest module symbols are loaded
490 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
491 	 *method is used to avoid symbol missing when the first addr is
492 	 *in module instead of in guest kernel.
493 	 */
494 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
495 					     machine);
496 	if (err < 0)
497 		pr_err("Couldn't record guest kernel [%d]'s reference"
498 		       " relocation symbol.\n", machine->pid);
499 
500 	/*
501 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
502 	 * have no _text sometimes.
503 	 */
504 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
505 						 machine);
506 	if (err < 0)
507 		pr_err("Couldn't record guest kernel [%d]'s reference"
508 		       " relocation symbol.\n", machine->pid);
509 }
510 
511 static struct perf_event_header finished_round_event = {
512 	.size = sizeof(struct perf_event_header),
513 	.type = PERF_RECORD_FINISHED_ROUND,
514 };
515 
516 static int record__mmap_read_evlist(struct record *rec, struct perf_evlist *evlist,
517 				    bool overwrite)
518 {
519 	u64 bytes_written = rec->bytes_written;
520 	int i;
521 	int rc = 0;
522 	struct perf_mmap *maps;
523 
524 	if (!evlist)
525 		return 0;
526 
527 	maps = overwrite ? evlist->overwrite_mmap : evlist->mmap;
528 	if (!maps)
529 		return 0;
530 
531 	if (overwrite && evlist->bkw_mmap_state != BKW_MMAP_DATA_PENDING)
532 		return 0;
533 
534 	for (i = 0; i < evlist->nr_mmaps; i++) {
535 		struct perf_mmap *map = &maps[i];
536 
537 		if (map->base) {
538 			if (perf_mmap__push(map, rec, record__pushfn) != 0) {
539 				rc = -1;
540 				goto out;
541 			}
542 		}
543 
544 		if (map->auxtrace_mmap.base && !rec->opts.auxtrace_snapshot_mode &&
545 		    record__auxtrace_mmap_read(rec, map) != 0) {
546 			rc = -1;
547 			goto out;
548 		}
549 	}
550 
551 	/*
552 	 * Mark the round finished in case we wrote
553 	 * at least one event.
554 	 */
555 	if (bytes_written != rec->bytes_written)
556 		rc = record__write(rec, NULL, &finished_round_event, sizeof(finished_round_event));
557 
558 	if (overwrite)
559 		perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY);
560 out:
561 	return rc;
562 }
563 
564 static int record__mmap_read_all(struct record *rec)
565 {
566 	int err;
567 
568 	err = record__mmap_read_evlist(rec, rec->evlist, false);
569 	if (err)
570 		return err;
571 
572 	return record__mmap_read_evlist(rec, rec->evlist, true);
573 }
574 
575 static void record__init_features(struct record *rec)
576 {
577 	struct perf_session *session = rec->session;
578 	int feat;
579 
580 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
581 		perf_header__set_feat(&session->header, feat);
582 
583 	if (rec->no_buildid)
584 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
585 
586 	if (!have_tracepoints(&rec->evlist->entries))
587 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
588 
589 	if (!rec->opts.branch_stack)
590 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
591 
592 	if (!rec->opts.full_auxtrace)
593 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
594 
595 	if (!(rec->opts.use_clockid && rec->opts.clockid_res_ns))
596 		perf_header__clear_feat(&session->header, HEADER_CLOCKID);
597 
598 	perf_header__clear_feat(&session->header, HEADER_STAT);
599 }
600 
601 static void
602 record__finish_output(struct record *rec)
603 {
604 	struct perf_data *data = &rec->data;
605 	int fd = perf_data__fd(data);
606 
607 	if (data->is_pipe)
608 		return;
609 
610 	rec->session->header.data_size += rec->bytes_written;
611 	data->size = lseek(perf_data__fd(data), 0, SEEK_CUR);
612 
613 	if (!rec->no_buildid) {
614 		process_buildids(rec);
615 
616 		if (rec->buildid_all)
617 			dsos__hit_all(rec->session);
618 	}
619 	perf_session__write_header(rec->session, rec->evlist, fd, true);
620 
621 	return;
622 }
623 
624 static int record__synthesize_workload(struct record *rec, bool tail)
625 {
626 	int err;
627 	struct thread_map *thread_map;
628 
629 	if (rec->opts.tail_synthesize != tail)
630 		return 0;
631 
632 	thread_map = thread_map__new_by_tid(rec->evlist->workload.pid);
633 	if (thread_map == NULL)
634 		return -1;
635 
636 	err = perf_event__synthesize_thread_map(&rec->tool, thread_map,
637 						 process_synthesized_event,
638 						 &rec->session->machines.host,
639 						 rec->opts.sample_address,
640 						 rec->opts.proc_map_timeout);
641 	thread_map__put(thread_map);
642 	return err;
643 }
644 
645 static int record__synthesize(struct record *rec, bool tail);
646 
647 static int
648 record__switch_output(struct record *rec, bool at_exit)
649 {
650 	struct perf_data *data = &rec->data;
651 	int fd, err;
652 
653 	/* Same Size:      "2015122520103046"*/
654 	char timestamp[] = "InvalidTimestamp";
655 
656 	record__synthesize(rec, true);
657 	if (target__none(&rec->opts.target))
658 		record__synthesize_workload(rec, true);
659 
660 	rec->samples = 0;
661 	record__finish_output(rec);
662 	err = fetch_current_timestamp(timestamp, sizeof(timestamp));
663 	if (err) {
664 		pr_err("Failed to get current timestamp\n");
665 		return -EINVAL;
666 	}
667 
668 	fd = perf_data__switch(data, timestamp,
669 				    rec->session->header.data_offset,
670 				    at_exit);
671 	if (fd >= 0 && !at_exit) {
672 		rec->bytes_written = 0;
673 		rec->session->header.data_size = 0;
674 	}
675 
676 	if (!quiet)
677 		fprintf(stderr, "[ perf record: Dump %s.%s ]\n",
678 			data->file.path, timestamp);
679 
680 	/* Output tracking events */
681 	if (!at_exit) {
682 		record__synthesize(rec, false);
683 
684 		/*
685 		 * In 'perf record --switch-output' without -a,
686 		 * record__synthesize() in record__switch_output() won't
687 		 * generate tracking events because there's no thread_map
688 		 * in evlist. Which causes newly created perf.data doesn't
689 		 * contain map and comm information.
690 		 * Create a fake thread_map and directly call
691 		 * perf_event__synthesize_thread_map() for those events.
692 		 */
693 		if (target__none(&rec->opts.target))
694 			record__synthesize_workload(rec, false);
695 	}
696 	return fd;
697 }
698 
699 static volatile int workload_exec_errno;
700 
701 /*
702  * perf_evlist__prepare_workload will send a SIGUSR1
703  * if the fork fails, since we asked by setting its
704  * want_signal to true.
705  */
706 static void workload_exec_failed_signal(int signo __maybe_unused,
707 					siginfo_t *info,
708 					void *ucontext __maybe_unused)
709 {
710 	workload_exec_errno = info->si_value.sival_int;
711 	done = 1;
712 	child_finished = 1;
713 }
714 
715 static void snapshot_sig_handler(int sig);
716 static void alarm_sig_handler(int sig);
717 
718 int __weak
719 perf_event__synth_time_conv(const struct perf_event_mmap_page *pc __maybe_unused,
720 			    struct perf_tool *tool __maybe_unused,
721 			    perf_event__handler_t process __maybe_unused,
722 			    struct machine *machine __maybe_unused)
723 {
724 	return 0;
725 }
726 
727 static const struct perf_event_mmap_page *
728 perf_evlist__pick_pc(struct perf_evlist *evlist)
729 {
730 	if (evlist) {
731 		if (evlist->mmap && evlist->mmap[0].base)
732 			return evlist->mmap[0].base;
733 		if (evlist->overwrite_mmap && evlist->overwrite_mmap[0].base)
734 			return evlist->overwrite_mmap[0].base;
735 	}
736 	return NULL;
737 }
738 
739 static const struct perf_event_mmap_page *record__pick_pc(struct record *rec)
740 {
741 	const struct perf_event_mmap_page *pc;
742 
743 	pc = perf_evlist__pick_pc(rec->evlist);
744 	if (pc)
745 		return pc;
746 	return NULL;
747 }
748 
749 static int record__synthesize(struct record *rec, bool tail)
750 {
751 	struct perf_session *session = rec->session;
752 	struct machine *machine = &session->machines.host;
753 	struct perf_data *data = &rec->data;
754 	struct record_opts *opts = &rec->opts;
755 	struct perf_tool *tool = &rec->tool;
756 	int fd = perf_data__fd(data);
757 	int err = 0;
758 
759 	if (rec->opts.tail_synthesize != tail)
760 		return 0;
761 
762 	if (data->is_pipe) {
763 		/*
764 		 * We need to synthesize events first, because some
765 		 * features works on top of them (on report side).
766 		 */
767 		err = perf_event__synthesize_attrs(tool, rec->evlist,
768 						   process_synthesized_event);
769 		if (err < 0) {
770 			pr_err("Couldn't synthesize attrs.\n");
771 			goto out;
772 		}
773 
774 		err = perf_event__synthesize_features(tool, session, rec->evlist,
775 						      process_synthesized_event);
776 		if (err < 0) {
777 			pr_err("Couldn't synthesize features.\n");
778 			return err;
779 		}
780 
781 		if (have_tracepoints(&rec->evlist->entries)) {
782 			/*
783 			 * FIXME err <= 0 here actually means that
784 			 * there were no tracepoints so its not really
785 			 * an error, just that we don't need to
786 			 * synthesize anything.  We really have to
787 			 * return this more properly and also
788 			 * propagate errors that now are calling die()
789 			 */
790 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
791 								  process_synthesized_event);
792 			if (err <= 0) {
793 				pr_err("Couldn't record tracing data.\n");
794 				goto out;
795 			}
796 			rec->bytes_written += err;
797 		}
798 	}
799 
800 	err = perf_event__synth_time_conv(record__pick_pc(rec), tool,
801 					  process_synthesized_event, machine);
802 	if (err)
803 		goto out;
804 
805 	if (rec->opts.full_auxtrace) {
806 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
807 					session, process_synthesized_event);
808 		if (err)
809 			goto out;
810 	}
811 
812 	if (!perf_evlist__exclude_kernel(rec->evlist)) {
813 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
814 							 machine);
815 		WARN_ONCE(err < 0, "Couldn't record kernel reference relocation symbol\n"
816 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
817 				   "Check /proc/kallsyms permission or run as root.\n");
818 
819 		err = perf_event__synthesize_modules(tool, process_synthesized_event,
820 						     machine);
821 		WARN_ONCE(err < 0, "Couldn't record kernel module information.\n"
822 				   "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
823 				   "Check /proc/modules permission or run as root.\n");
824 	}
825 
826 	if (perf_guest) {
827 		machines__process_guests(&session->machines,
828 					 perf_event__synthesize_guest_os, tool);
829 	}
830 
831 	err = perf_event__synthesize_extra_attr(&rec->tool,
832 						rec->evlist,
833 						process_synthesized_event,
834 						data->is_pipe);
835 	if (err)
836 		goto out;
837 
838 	err = perf_event__synthesize_thread_map2(&rec->tool, rec->evlist->threads,
839 						 process_synthesized_event,
840 						NULL);
841 	if (err < 0) {
842 		pr_err("Couldn't synthesize thread map.\n");
843 		return err;
844 	}
845 
846 	err = perf_event__synthesize_cpu_map(&rec->tool, rec->evlist->cpus,
847 					     process_synthesized_event, NULL);
848 	if (err < 0) {
849 		pr_err("Couldn't synthesize cpu map.\n");
850 		return err;
851 	}
852 
853 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
854 					    process_synthesized_event, opts->sample_address,
855 					    opts->proc_map_timeout, 1);
856 out:
857 	return err;
858 }
859 
860 static int __cmd_record(struct record *rec, int argc, const char **argv)
861 {
862 	int err;
863 	int status = 0;
864 	unsigned long waking = 0;
865 	const bool forks = argc > 0;
866 	struct perf_tool *tool = &rec->tool;
867 	struct record_opts *opts = &rec->opts;
868 	struct perf_data *data = &rec->data;
869 	struct perf_session *session;
870 	bool disabled = false, draining = false;
871 	int fd;
872 
873 	atexit(record__sig_exit);
874 	signal(SIGCHLD, sig_handler);
875 	signal(SIGINT, sig_handler);
876 	signal(SIGTERM, sig_handler);
877 	signal(SIGSEGV, sigsegv_handler);
878 
879 	if (rec->opts.record_namespaces)
880 		tool->namespace_events = true;
881 
882 	if (rec->opts.auxtrace_snapshot_mode || rec->switch_output.enabled) {
883 		signal(SIGUSR2, snapshot_sig_handler);
884 		if (rec->opts.auxtrace_snapshot_mode)
885 			trigger_on(&auxtrace_snapshot_trigger);
886 		if (rec->switch_output.enabled)
887 			trigger_on(&switch_output_trigger);
888 	} else {
889 		signal(SIGUSR2, SIG_IGN);
890 	}
891 
892 	session = perf_session__new(data, false, tool);
893 	if (session == NULL) {
894 		pr_err("Perf session creation failed.\n");
895 		return -1;
896 	}
897 
898 	fd = perf_data__fd(data);
899 	rec->session = session;
900 
901 	record__init_features(rec);
902 
903 	if (rec->opts.use_clockid && rec->opts.clockid_res_ns)
904 		session->header.env.clockid_res_ns = rec->opts.clockid_res_ns;
905 
906 	if (forks) {
907 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
908 						    argv, data->is_pipe,
909 						    workload_exec_failed_signal);
910 		if (err < 0) {
911 			pr_err("Couldn't run the workload!\n");
912 			status = err;
913 			goto out_delete_session;
914 		}
915 	}
916 
917 	/*
918 	 * If we have just single event and are sending data
919 	 * through pipe, we need to force the ids allocation,
920 	 * because we synthesize event name through the pipe
921 	 * and need the id for that.
922 	 */
923 	if (data->is_pipe && rec->evlist->nr_entries == 1)
924 		rec->opts.sample_id = true;
925 
926 	if (record__open(rec) != 0) {
927 		err = -1;
928 		goto out_child;
929 	}
930 
931 	err = bpf__apply_obj_config();
932 	if (err) {
933 		char errbuf[BUFSIZ];
934 
935 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
936 		pr_err("ERROR: Apply config to BPF failed: %s\n",
937 			 errbuf);
938 		goto out_child;
939 	}
940 
941 	/*
942 	 * Normally perf_session__new would do this, but it doesn't have the
943 	 * evlist.
944 	 */
945 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
946 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
947 		rec->tool.ordered_events = false;
948 	}
949 
950 	if (!rec->evlist->nr_groups)
951 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
952 
953 	if (data->is_pipe) {
954 		err = perf_header__write_pipe(fd);
955 		if (err < 0)
956 			goto out_child;
957 	} else {
958 		err = perf_session__write_header(session, rec->evlist, fd, false);
959 		if (err < 0)
960 			goto out_child;
961 	}
962 
963 	if (!rec->no_buildid
964 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
965 		pr_err("Couldn't generate buildids. "
966 		       "Use --no-buildid to profile anyway.\n");
967 		err = -1;
968 		goto out_child;
969 	}
970 
971 	err = record__synthesize(rec, false);
972 	if (err < 0)
973 		goto out_child;
974 
975 	if (rec->realtime_prio) {
976 		struct sched_param param;
977 
978 		param.sched_priority = rec->realtime_prio;
979 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
980 			pr_err("Could not set realtime priority.\n");
981 			err = -1;
982 			goto out_child;
983 		}
984 	}
985 
986 	/*
987 	 * When perf is starting the traced process, all the events
988 	 * (apart from group members) have enable_on_exec=1 set,
989 	 * so don't spoil it by prematurely enabling them.
990 	 */
991 	if (!target__none(&opts->target) && !opts->initial_delay)
992 		perf_evlist__enable(rec->evlist);
993 
994 	/*
995 	 * Let the child rip
996 	 */
997 	if (forks) {
998 		struct machine *machine = &session->machines.host;
999 		union perf_event *event;
1000 		pid_t tgid;
1001 
1002 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
1003 		if (event == NULL) {
1004 			err = -ENOMEM;
1005 			goto out_child;
1006 		}
1007 
1008 		/*
1009 		 * Some H/W events are generated before COMM event
1010 		 * which is emitted during exec(), so perf script
1011 		 * cannot see a correct process name for those events.
1012 		 * Synthesize COMM event to prevent it.
1013 		 */
1014 		tgid = perf_event__synthesize_comm(tool, event,
1015 						   rec->evlist->workload.pid,
1016 						   process_synthesized_event,
1017 						   machine);
1018 		free(event);
1019 
1020 		if (tgid == -1)
1021 			goto out_child;
1022 
1023 		event = malloc(sizeof(event->namespaces) +
1024 			       (NR_NAMESPACES * sizeof(struct perf_ns_link_info)) +
1025 			       machine->id_hdr_size);
1026 		if (event == NULL) {
1027 			err = -ENOMEM;
1028 			goto out_child;
1029 		}
1030 
1031 		/*
1032 		 * Synthesize NAMESPACES event for the command specified.
1033 		 */
1034 		perf_event__synthesize_namespaces(tool, event,
1035 						  rec->evlist->workload.pid,
1036 						  tgid, process_synthesized_event,
1037 						  machine);
1038 		free(event);
1039 
1040 		perf_evlist__start_workload(rec->evlist);
1041 	}
1042 
1043 	if (opts->initial_delay) {
1044 		usleep(opts->initial_delay * USEC_PER_MSEC);
1045 		perf_evlist__enable(rec->evlist);
1046 	}
1047 
1048 	trigger_ready(&auxtrace_snapshot_trigger);
1049 	trigger_ready(&switch_output_trigger);
1050 	perf_hooks__invoke_record_start();
1051 	for (;;) {
1052 		unsigned long long hits = rec->samples;
1053 
1054 		/*
1055 		 * rec->evlist->bkw_mmap_state is possible to be
1056 		 * BKW_MMAP_EMPTY here: when done == true and
1057 		 * hits != rec->samples in previous round.
1058 		 *
1059 		 * perf_evlist__toggle_bkw_mmap ensure we never
1060 		 * convert BKW_MMAP_EMPTY to BKW_MMAP_DATA_PENDING.
1061 		 */
1062 		if (trigger_is_hit(&switch_output_trigger) || done || draining)
1063 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_DATA_PENDING);
1064 
1065 		if (record__mmap_read_all(rec) < 0) {
1066 			trigger_error(&auxtrace_snapshot_trigger);
1067 			trigger_error(&switch_output_trigger);
1068 			err = -1;
1069 			goto out_child;
1070 		}
1071 
1072 		if (auxtrace_record__snapshot_started) {
1073 			auxtrace_record__snapshot_started = 0;
1074 			if (!trigger_is_error(&auxtrace_snapshot_trigger))
1075 				record__read_auxtrace_snapshot(rec);
1076 			if (trigger_is_error(&auxtrace_snapshot_trigger)) {
1077 				pr_err("AUX area tracing snapshot failed\n");
1078 				err = -1;
1079 				goto out_child;
1080 			}
1081 		}
1082 
1083 		if (trigger_is_hit(&switch_output_trigger)) {
1084 			/*
1085 			 * If switch_output_trigger is hit, the data in
1086 			 * overwritable ring buffer should have been collected,
1087 			 * so bkw_mmap_state should be set to BKW_MMAP_EMPTY.
1088 			 *
1089 			 * If SIGUSR2 raise after or during record__mmap_read_all(),
1090 			 * record__mmap_read_all() didn't collect data from
1091 			 * overwritable ring buffer. Read again.
1092 			 */
1093 			if (rec->evlist->bkw_mmap_state == BKW_MMAP_RUNNING)
1094 				continue;
1095 			trigger_ready(&switch_output_trigger);
1096 
1097 			/*
1098 			 * Reenable events in overwrite ring buffer after
1099 			 * record__mmap_read_all(): we should have collected
1100 			 * data from it.
1101 			 */
1102 			perf_evlist__toggle_bkw_mmap(rec->evlist, BKW_MMAP_RUNNING);
1103 
1104 			if (!quiet)
1105 				fprintf(stderr, "[ perf record: dump data: Woken up %ld times ]\n",
1106 					waking);
1107 			waking = 0;
1108 			fd = record__switch_output(rec, false);
1109 			if (fd < 0) {
1110 				pr_err("Failed to switch to new file\n");
1111 				trigger_error(&switch_output_trigger);
1112 				err = fd;
1113 				goto out_child;
1114 			}
1115 
1116 			/* re-arm the alarm */
1117 			if (rec->switch_output.time)
1118 				alarm(rec->switch_output.time);
1119 		}
1120 
1121 		if (hits == rec->samples) {
1122 			if (done || draining)
1123 				break;
1124 			err = perf_evlist__poll(rec->evlist, -1);
1125 			/*
1126 			 * Propagate error, only if there's any. Ignore positive
1127 			 * number of returned events and interrupt error.
1128 			 */
1129 			if (err > 0 || (err < 0 && errno == EINTR))
1130 				err = 0;
1131 			waking++;
1132 
1133 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
1134 				draining = true;
1135 		}
1136 
1137 		/*
1138 		 * When perf is starting the traced process, at the end events
1139 		 * die with the process and we wait for that. Thus no need to
1140 		 * disable events in this case.
1141 		 */
1142 		if (done && !disabled && !target__none(&opts->target)) {
1143 			trigger_off(&auxtrace_snapshot_trigger);
1144 			perf_evlist__disable(rec->evlist);
1145 			disabled = true;
1146 		}
1147 	}
1148 	trigger_off(&auxtrace_snapshot_trigger);
1149 	trigger_off(&switch_output_trigger);
1150 
1151 	if (forks && workload_exec_errno) {
1152 		char msg[STRERR_BUFSIZE];
1153 		const char *emsg = str_error_r(workload_exec_errno, msg, sizeof(msg));
1154 		pr_err("Workload failed: %s\n", emsg);
1155 		err = -1;
1156 		goto out_child;
1157 	}
1158 
1159 	if (!quiet)
1160 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
1161 
1162 	if (target__none(&rec->opts.target))
1163 		record__synthesize_workload(rec, true);
1164 
1165 out_child:
1166 	if (forks) {
1167 		int exit_status;
1168 
1169 		if (!child_finished)
1170 			kill(rec->evlist->workload.pid, SIGTERM);
1171 
1172 		wait(&exit_status);
1173 
1174 		if (err < 0)
1175 			status = err;
1176 		else if (WIFEXITED(exit_status))
1177 			status = WEXITSTATUS(exit_status);
1178 		else if (WIFSIGNALED(exit_status))
1179 			signr = WTERMSIG(exit_status);
1180 	} else
1181 		status = err;
1182 
1183 	record__synthesize(rec, true);
1184 	/* this will be recalculated during process_buildids() */
1185 	rec->samples = 0;
1186 
1187 	if (!err) {
1188 		if (!rec->timestamp_filename) {
1189 			record__finish_output(rec);
1190 		} else {
1191 			fd = record__switch_output(rec, true);
1192 			if (fd < 0) {
1193 				status = fd;
1194 				goto out_delete_session;
1195 			}
1196 		}
1197 	}
1198 
1199 	perf_hooks__invoke_record_end();
1200 
1201 	if (!err && !quiet) {
1202 		char samples[128];
1203 		const char *postfix = rec->timestamp_filename ?
1204 					".<timestamp>" : "";
1205 
1206 		if (rec->samples && !rec->opts.full_auxtrace)
1207 			scnprintf(samples, sizeof(samples),
1208 				  " (%" PRIu64 " samples)", rec->samples);
1209 		else
1210 			samples[0] = '\0';
1211 
1212 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s%s ]\n",
1213 			perf_data__size(data) / 1024.0 / 1024.0,
1214 			data->file.path, postfix, samples);
1215 	}
1216 
1217 out_delete_session:
1218 	perf_session__delete(session);
1219 	return status;
1220 }
1221 
1222 static void callchain_debug(struct callchain_param *callchain)
1223 {
1224 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
1225 
1226 	pr_debug("callchain: type %s\n", str[callchain->record_mode]);
1227 
1228 	if (callchain->record_mode == CALLCHAIN_DWARF)
1229 		pr_debug("callchain: stack dump size %d\n",
1230 			 callchain->dump_size);
1231 }
1232 
1233 int record_opts__parse_callchain(struct record_opts *record,
1234 				 struct callchain_param *callchain,
1235 				 const char *arg, bool unset)
1236 {
1237 	int ret;
1238 	callchain->enabled = !unset;
1239 
1240 	/* --no-call-graph */
1241 	if (unset) {
1242 		callchain->record_mode = CALLCHAIN_NONE;
1243 		pr_debug("callchain: disabled\n");
1244 		return 0;
1245 	}
1246 
1247 	ret = parse_callchain_record_opt(arg, callchain);
1248 	if (!ret) {
1249 		/* Enable data address sampling for DWARF unwind. */
1250 		if (callchain->record_mode == CALLCHAIN_DWARF)
1251 			record->sample_address = true;
1252 		callchain_debug(callchain);
1253 	}
1254 
1255 	return ret;
1256 }
1257 
1258 int record_parse_callchain_opt(const struct option *opt,
1259 			       const char *arg,
1260 			       int unset)
1261 {
1262 	return record_opts__parse_callchain(opt->value, &callchain_param, arg, unset);
1263 }
1264 
1265 int record_callchain_opt(const struct option *opt,
1266 			 const char *arg __maybe_unused,
1267 			 int unset __maybe_unused)
1268 {
1269 	struct callchain_param *callchain = opt->value;
1270 
1271 	callchain->enabled = true;
1272 
1273 	if (callchain->record_mode == CALLCHAIN_NONE)
1274 		callchain->record_mode = CALLCHAIN_FP;
1275 
1276 	callchain_debug(callchain);
1277 	return 0;
1278 }
1279 
1280 static int perf_record_config(const char *var, const char *value, void *cb)
1281 {
1282 	struct record *rec = cb;
1283 
1284 	if (!strcmp(var, "record.build-id")) {
1285 		if (!strcmp(value, "cache"))
1286 			rec->no_buildid_cache = false;
1287 		else if (!strcmp(value, "no-cache"))
1288 			rec->no_buildid_cache = true;
1289 		else if (!strcmp(value, "skip"))
1290 			rec->no_buildid = true;
1291 		else
1292 			return -1;
1293 		return 0;
1294 	}
1295 	if (!strcmp(var, "record.call-graph")) {
1296 		var = "call-graph.record-mode";
1297 		return perf_default_config(var, value, cb);
1298 	}
1299 
1300 	return 0;
1301 }
1302 
1303 struct clockid_map {
1304 	const char *name;
1305 	int clockid;
1306 };
1307 
1308 #define CLOCKID_MAP(n, c)	\
1309 	{ .name = n, .clockid = (c), }
1310 
1311 #define CLOCKID_END	{ .name = NULL, }
1312 
1313 
1314 /*
1315  * Add the missing ones, we need to build on many distros...
1316  */
1317 #ifndef CLOCK_MONOTONIC_RAW
1318 #define CLOCK_MONOTONIC_RAW 4
1319 #endif
1320 #ifndef CLOCK_BOOTTIME
1321 #define CLOCK_BOOTTIME 7
1322 #endif
1323 #ifndef CLOCK_TAI
1324 #define CLOCK_TAI 11
1325 #endif
1326 
1327 static const struct clockid_map clockids[] = {
1328 	/* available for all events, NMI safe */
1329 	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
1330 	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),
1331 
1332 	/* available for some events */
1333 	CLOCKID_MAP("realtime", CLOCK_REALTIME),
1334 	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
1335 	CLOCKID_MAP("tai", CLOCK_TAI),
1336 
1337 	/* available for the lazy */
1338 	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
1339 	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
1340 	CLOCKID_MAP("real", CLOCK_REALTIME),
1341 	CLOCKID_MAP("boot", CLOCK_BOOTTIME),
1342 
1343 	CLOCKID_END,
1344 };
1345 
1346 static int get_clockid_res(clockid_t clk_id, u64 *res_ns)
1347 {
1348 	struct timespec res;
1349 
1350 	*res_ns = 0;
1351 	if (!clock_getres(clk_id, &res))
1352 		*res_ns = res.tv_nsec + res.tv_sec * NSEC_PER_SEC;
1353 	else
1354 		pr_warning("WARNING: Failed to determine specified clock resolution.\n");
1355 
1356 	return 0;
1357 }
1358 
1359 static int parse_clockid(const struct option *opt, const char *str, int unset)
1360 {
1361 	struct record_opts *opts = (struct record_opts *)opt->value;
1362 	const struct clockid_map *cm;
1363 	const char *ostr = str;
1364 
1365 	if (unset) {
1366 		opts->use_clockid = 0;
1367 		return 0;
1368 	}
1369 
1370 	/* no arg passed */
1371 	if (!str)
1372 		return 0;
1373 
1374 	/* no setting it twice */
1375 	if (opts->use_clockid)
1376 		return -1;
1377 
1378 	opts->use_clockid = true;
1379 
1380 	/* if its a number, we're done */
1381 	if (sscanf(str, "%d", &opts->clockid) == 1)
1382 		return get_clockid_res(opts->clockid, &opts->clockid_res_ns);
1383 
1384 	/* allow a "CLOCK_" prefix to the name */
1385 	if (!strncasecmp(str, "CLOCK_", 6))
1386 		str += 6;
1387 
1388 	for (cm = clockids; cm->name; cm++) {
1389 		if (!strcasecmp(str, cm->name)) {
1390 			opts->clockid = cm->clockid;
1391 			return get_clockid_res(opts->clockid,
1392 					       &opts->clockid_res_ns);
1393 		}
1394 	}
1395 
1396 	opts->use_clockid = false;
1397 	ui__warning("unknown clockid %s, check man page\n", ostr);
1398 	return -1;
1399 }
1400 
1401 static int record__parse_mmap_pages(const struct option *opt,
1402 				    const char *str,
1403 				    int unset __maybe_unused)
1404 {
1405 	struct record_opts *opts = opt->value;
1406 	char *s, *p;
1407 	unsigned int mmap_pages;
1408 	int ret;
1409 
1410 	if (!str)
1411 		return -EINVAL;
1412 
1413 	s = strdup(str);
1414 	if (!s)
1415 		return -ENOMEM;
1416 
1417 	p = strchr(s, ',');
1418 	if (p)
1419 		*p = '\0';
1420 
1421 	if (*s) {
1422 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
1423 		if (ret)
1424 			goto out_free;
1425 		opts->mmap_pages = mmap_pages;
1426 	}
1427 
1428 	if (!p) {
1429 		ret = 0;
1430 		goto out_free;
1431 	}
1432 
1433 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
1434 	if (ret)
1435 		goto out_free;
1436 
1437 	opts->auxtrace_mmap_pages = mmap_pages;
1438 
1439 out_free:
1440 	free(s);
1441 	return ret;
1442 }
1443 
1444 static void switch_output_size_warn(struct record *rec)
1445 {
1446 	u64 wakeup_size = perf_evlist__mmap_size(rec->opts.mmap_pages);
1447 	struct switch_output *s = &rec->switch_output;
1448 
1449 	wakeup_size /= 2;
1450 
1451 	if (s->size < wakeup_size) {
1452 		char buf[100];
1453 
1454 		unit_number__scnprintf(buf, sizeof(buf), wakeup_size);
1455 		pr_warning("WARNING: switch-output data size lower than "
1456 			   "wakeup kernel buffer size (%s) "
1457 			   "expect bigger perf.data sizes\n", buf);
1458 	}
1459 }
1460 
1461 static int switch_output_setup(struct record *rec)
1462 {
1463 	struct switch_output *s = &rec->switch_output;
1464 	static struct parse_tag tags_size[] = {
1465 		{ .tag  = 'B', .mult = 1       },
1466 		{ .tag  = 'K', .mult = 1 << 10 },
1467 		{ .tag  = 'M', .mult = 1 << 20 },
1468 		{ .tag  = 'G', .mult = 1 << 30 },
1469 		{ .tag  = 0 },
1470 	};
1471 	static struct parse_tag tags_time[] = {
1472 		{ .tag  = 's', .mult = 1        },
1473 		{ .tag  = 'm', .mult = 60       },
1474 		{ .tag  = 'h', .mult = 60*60    },
1475 		{ .tag  = 'd', .mult = 60*60*24 },
1476 		{ .tag  = 0 },
1477 	};
1478 	unsigned long val;
1479 
1480 	if (!s->set)
1481 		return 0;
1482 
1483 	if (!strcmp(s->str, "signal")) {
1484 		s->signal = true;
1485 		pr_debug("switch-output with SIGUSR2 signal\n");
1486 		goto enabled;
1487 	}
1488 
1489 	val = parse_tag_value(s->str, tags_size);
1490 	if (val != (unsigned long) -1) {
1491 		s->size = val;
1492 		pr_debug("switch-output with %s size threshold\n", s->str);
1493 		goto enabled;
1494 	}
1495 
1496 	val = parse_tag_value(s->str, tags_time);
1497 	if (val != (unsigned long) -1) {
1498 		s->time = val;
1499 		pr_debug("switch-output with %s time threshold (%lu seconds)\n",
1500 			 s->str, s->time);
1501 		goto enabled;
1502 	}
1503 
1504 	return -1;
1505 
1506 enabled:
1507 	rec->timestamp_filename = true;
1508 	s->enabled              = true;
1509 
1510 	if (s->size && !rec->opts.no_buffering)
1511 		switch_output_size_warn(rec);
1512 
1513 	return 0;
1514 }
1515 
1516 static const char * const __record_usage[] = {
1517 	"perf record [<options>] [<command>]",
1518 	"perf record [<options>] -- <command> [<options>]",
1519 	NULL
1520 };
1521 const char * const *record_usage = __record_usage;
1522 
1523 /*
1524  * XXX Ideally would be local to cmd_record() and passed to a record__new
1525  * because we need to have access to it in record__exit, that is called
1526  * after cmd_record() exits, but since record_options need to be accessible to
1527  * builtin-script, leave it here.
1528  *
1529  * At least we don't ouch it in all the other functions here directly.
1530  *
1531  * Just say no to tons of global variables, sigh.
1532  */
1533 static struct record record = {
1534 	.opts = {
1535 		.sample_time	     = true,
1536 		.mmap_pages	     = UINT_MAX,
1537 		.user_freq	     = UINT_MAX,
1538 		.user_interval	     = ULLONG_MAX,
1539 		.freq		     = 4000,
1540 		.target		     = {
1541 			.uses_mmap   = true,
1542 			.default_per_cpu = true,
1543 		},
1544 		.proc_map_timeout     = 500,
1545 	},
1546 	.tool = {
1547 		.sample		= process_sample_event,
1548 		.fork		= perf_event__process_fork,
1549 		.exit		= perf_event__process_exit,
1550 		.comm		= perf_event__process_comm,
1551 		.namespaces	= perf_event__process_namespaces,
1552 		.mmap		= perf_event__process_mmap,
1553 		.mmap2		= perf_event__process_mmap2,
1554 		.ordered_events	= true,
1555 	},
1556 };
1557 
1558 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1559 	"\n\t\t\t\tDefault: fp";
1560 
1561 static bool dry_run;
1562 
1563 /*
1564  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1565  * with it and switch to use the library functions in perf_evlist that came
1566  * from builtin-record.c, i.e. use record_opts,
1567  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1568  * using pipes, etc.
1569  */
1570 static struct option __record_options[] = {
1571 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1572 		     "event selector. use 'perf list' to list available events",
1573 		     parse_events_option),
1574 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1575 		     "event filter", parse_filter),
1576 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1577 			   NULL, "don't record events from perf itself",
1578 			   exclude_perf),
1579 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1580 		    "record events on existing process id"),
1581 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1582 		    "record events on existing thread id"),
1583 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1584 		    "collect data with this RT SCHED_FIFO priority"),
1585 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1586 		    "collect data without buffering"),
1587 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1588 		    "collect raw sample records from all opened counters"),
1589 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1590 			    "system-wide collection from all CPUs"),
1591 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1592 		    "list of cpus to monitor"),
1593 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1594 	OPT_STRING('o', "output", &record.data.file.path, "file",
1595 		    "output file name"),
1596 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1597 			&record.opts.no_inherit_set,
1598 			"child tasks do not inherit counters"),
1599 	OPT_BOOLEAN(0, "tail-synthesize", &record.opts.tail_synthesize,
1600 		    "synthesize non-sample events at the end of output"),
1601 	OPT_BOOLEAN(0, "overwrite", &record.opts.overwrite, "use overwrite mode"),
1602 	OPT_BOOLEAN(0, "strict-freq", &record.opts.strict_freq,
1603 		    "Fail if the specified frequency can't be used"),
1604 	OPT_CALLBACK('F', "freq", &record.opts, "freq or 'max'",
1605 		     "profile at this frequency",
1606 		      record__parse_freq),
1607 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1608 		     "number of mmap data pages and AUX area tracing mmap pages",
1609 		     record__parse_mmap_pages),
1610 	OPT_BOOLEAN(0, "group", &record.opts.group,
1611 		    "put the counters into a counter group"),
1612 	OPT_CALLBACK_NOOPT('g', NULL, &callchain_param,
1613 			   NULL, "enables call-graph recording" ,
1614 			   &record_callchain_opt),
1615 	OPT_CALLBACK(0, "call-graph", &record.opts,
1616 		     "record_mode[,record_size]", record_callchain_help,
1617 		     &record_parse_callchain_opt),
1618 	OPT_INCR('v', "verbose", &verbose,
1619 		    "be more verbose (show counter open errors, etc)"),
1620 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1621 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1622 		    "per thread counts"),
1623 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1624 	OPT_BOOLEAN(0, "phys-data", &record.opts.sample_phys_addr,
1625 		    "Record the sample physical addresses"),
1626 	OPT_BOOLEAN(0, "sample-cpu", &record.opts.sample_cpu, "Record the sample cpu"),
1627 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1628 			&record.opts.sample_time_set,
1629 			"Record the sample timestamps"),
1630 	OPT_BOOLEAN_SET('P', "period", &record.opts.period, &record.opts.period_set,
1631 			"Record the sample period"),
1632 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1633 		    "don't sample"),
1634 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1635 			&record.no_buildid_cache_set,
1636 			"do not update the buildid cache"),
1637 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1638 			&record.no_buildid_set,
1639 			"do not collect buildids in perf.data"),
1640 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1641 		     "monitor event in cgroup name only",
1642 		     parse_cgroups),
1643 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1644 		  "ms to wait before starting measurement after program start"),
1645 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1646 		   "user to profile"),
1647 
1648 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1649 		     "branch any", "sample any taken branches",
1650 		     parse_branch_stack),
1651 
1652 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1653 		     "branch filter mask", "branch stack filter modes",
1654 		     parse_branch_stack),
1655 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1656 		    "sample by weight (on special events only)"),
1657 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1658 		    "sample transaction flags (special events only)"),
1659 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1660 		    "use per-thread mmaps"),
1661 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1662 		    "sample selected machine registers on interrupt,"
1663 		    " use -I ? to list register names", parse_regs),
1664 	OPT_CALLBACK_OPTARG(0, "user-regs", &record.opts.sample_user_regs, NULL, "any register",
1665 		    "sample selected machine registers on interrupt,"
1666 		    " use -I ? to list register names", parse_regs),
1667 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1668 		    "Record running/enabled time of read (:S) events"),
1669 	OPT_CALLBACK('k', "clockid", &record.opts,
1670 	"clockid", "clockid to use for events, see clock_gettime()",
1671 	parse_clockid),
1672 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1673 			  "opts", "AUX area tracing Snapshot Mode", ""),
1674 	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1675 			"per thread proc mmap processing timeout in ms"),
1676 	OPT_BOOLEAN(0, "namespaces", &record.opts.record_namespaces,
1677 		    "Record namespaces events"),
1678 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1679 		    "Record context switch events"),
1680 	OPT_BOOLEAN_FLAG(0, "all-kernel", &record.opts.all_kernel,
1681 			 "Configure all used events to run in kernel space.",
1682 			 PARSE_OPT_EXCLUSIVE),
1683 	OPT_BOOLEAN_FLAG(0, "all-user", &record.opts.all_user,
1684 			 "Configure all used events to run in user space.",
1685 			 PARSE_OPT_EXCLUSIVE),
1686 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1687 		   "clang binary to use for compiling BPF scriptlets"),
1688 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1689 		   "options passed to clang when compiling BPF scriptlets"),
1690 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
1691 		   "file", "vmlinux pathname"),
1692 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
1693 		    "Record build-id of all DSOs regardless of hits"),
1694 	OPT_BOOLEAN(0, "timestamp-filename", &record.timestamp_filename,
1695 		    "append timestamp to output filename"),
1696 	OPT_BOOLEAN(0, "timestamp-boundary", &record.timestamp_boundary,
1697 		    "Record timestamp boundary (time of first/last samples)"),
1698 	OPT_STRING_OPTARG_SET(0, "switch-output", &record.switch_output.str,
1699 			  &record.switch_output.set, "signal,size,time",
1700 			  "Switch output when receive SIGUSR2 or cross size,time threshold",
1701 			  "signal"),
1702 	OPT_BOOLEAN(0, "dry-run", &dry_run,
1703 		    "Parse options then exit"),
1704 	OPT_END()
1705 };
1706 
1707 struct option *record_options = __record_options;
1708 
1709 int cmd_record(int argc, const char **argv)
1710 {
1711 	int err;
1712 	struct record *rec = &record;
1713 	char errbuf[BUFSIZ];
1714 
1715 	setlocale(LC_ALL, "");
1716 
1717 #ifndef HAVE_LIBBPF_SUPPORT
1718 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
1719 	set_nobuild('\0', "clang-path", true);
1720 	set_nobuild('\0', "clang-opt", true);
1721 # undef set_nobuild
1722 #endif
1723 
1724 #ifndef HAVE_BPF_PROLOGUE
1725 # if !defined (HAVE_DWARF_SUPPORT)
1726 #  define REASON  "NO_DWARF=1"
1727 # elif !defined (HAVE_LIBBPF_SUPPORT)
1728 #  define REASON  "NO_LIBBPF=1"
1729 # else
1730 #  define REASON  "this architecture doesn't support BPF prologue"
1731 # endif
1732 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
1733 	set_nobuild('\0', "vmlinux", true);
1734 # undef set_nobuild
1735 # undef REASON
1736 #endif
1737 
1738 	rec->evlist = perf_evlist__new();
1739 	if (rec->evlist == NULL)
1740 		return -ENOMEM;
1741 
1742 	err = perf_config(perf_record_config, rec);
1743 	if (err)
1744 		return err;
1745 
1746 	argc = parse_options(argc, argv, record_options, record_usage,
1747 			    PARSE_OPT_STOP_AT_NON_OPTION);
1748 	if (quiet)
1749 		perf_quiet_option();
1750 
1751 	/* Make system wide (-a) the default target. */
1752 	if (!argc && target__none(&rec->opts.target))
1753 		rec->opts.target.system_wide = true;
1754 
1755 	if (nr_cgroups && !rec->opts.target.system_wide) {
1756 		usage_with_options_msg(record_usage, record_options,
1757 			"cgroup monitoring only available in system-wide mode");
1758 
1759 	}
1760 	if (rec->opts.record_switch_events &&
1761 	    !perf_can_record_switch_events()) {
1762 		ui__error("kernel does not support recording context switch events\n");
1763 		parse_options_usage(record_usage, record_options, "switch-events", 0);
1764 		return -EINVAL;
1765 	}
1766 
1767 	if (switch_output_setup(rec)) {
1768 		parse_options_usage(record_usage, record_options, "switch-output", 0);
1769 		return -EINVAL;
1770 	}
1771 
1772 	if (rec->switch_output.time) {
1773 		signal(SIGALRM, alarm_sig_handler);
1774 		alarm(rec->switch_output.time);
1775 	}
1776 
1777 	/*
1778 	 * Allow aliases to facilitate the lookup of symbols for address
1779 	 * filters. Refer to auxtrace_parse_filters().
1780 	 */
1781 	symbol_conf.allow_aliases = true;
1782 
1783 	symbol__init(NULL);
1784 
1785 	err = record__auxtrace_init(rec);
1786 	if (err)
1787 		goto out;
1788 
1789 	if (dry_run)
1790 		goto out;
1791 
1792 	err = bpf__setup_stdout(rec->evlist);
1793 	if (err) {
1794 		bpf__strerror_setup_stdout(rec->evlist, err, errbuf, sizeof(errbuf));
1795 		pr_err("ERROR: Setup BPF stdout failed: %s\n",
1796 			 errbuf);
1797 		goto out;
1798 	}
1799 
1800 	err = -ENOMEM;
1801 
1802 	if (symbol_conf.kptr_restrict && !perf_evlist__exclude_kernel(rec->evlist))
1803 		pr_warning(
1804 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1805 "check /proc/sys/kernel/kptr_restrict.\n\n"
1806 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1807 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1808 "Samples in kernel modules won't be resolved at all.\n\n"
1809 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1810 "even with a suitable vmlinux or kallsyms file.\n\n");
1811 
1812 	if (rec->no_buildid_cache || rec->no_buildid) {
1813 		disable_buildid_cache();
1814 	} else if (rec->switch_output.enabled) {
1815 		/*
1816 		 * In 'perf record --switch-output', disable buildid
1817 		 * generation by default to reduce data file switching
1818 		 * overhead. Still generate buildid if they are required
1819 		 * explicitly using
1820 		 *
1821 		 *  perf record --switch-output --no-no-buildid \
1822 		 *              --no-no-buildid-cache
1823 		 *
1824 		 * Following code equals to:
1825 		 *
1826 		 * if ((rec->no_buildid || !rec->no_buildid_set) &&
1827 		 *     (rec->no_buildid_cache || !rec->no_buildid_cache_set))
1828 		 *         disable_buildid_cache();
1829 		 */
1830 		bool disable = true;
1831 
1832 		if (rec->no_buildid_set && !rec->no_buildid)
1833 			disable = false;
1834 		if (rec->no_buildid_cache_set && !rec->no_buildid_cache)
1835 			disable = false;
1836 		if (disable) {
1837 			rec->no_buildid = true;
1838 			rec->no_buildid_cache = true;
1839 			disable_buildid_cache();
1840 		}
1841 	}
1842 
1843 	if (record.opts.overwrite)
1844 		record.opts.tail_synthesize = true;
1845 
1846 	if (rec->evlist->nr_entries == 0 &&
1847 	    __perf_evlist__add_default(rec->evlist, !record.opts.no_samples) < 0) {
1848 		pr_err("Not enough memory for event selector list\n");
1849 		goto out;
1850 	}
1851 
1852 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1853 		rec->opts.no_inherit = true;
1854 
1855 	err = target__validate(&rec->opts.target);
1856 	if (err) {
1857 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1858 		ui__warning("%s\n", errbuf);
1859 	}
1860 
1861 	err = target__parse_uid(&rec->opts.target);
1862 	if (err) {
1863 		int saved_errno = errno;
1864 
1865 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1866 		ui__error("%s", errbuf);
1867 
1868 		err = -saved_errno;
1869 		goto out;
1870 	}
1871 
1872 	/* Enable ignoring missing threads when -u/-p option is defined. */
1873 	rec->opts.ignore_missing_thread = rec->opts.target.uid != UINT_MAX || rec->opts.target.pid;
1874 
1875 	err = -ENOMEM;
1876 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1877 		usage_with_options(record_usage, record_options);
1878 
1879 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1880 	if (err)
1881 		goto out;
1882 
1883 	/*
1884 	 * We take all buildids when the file contains
1885 	 * AUX area tracing data because we do not decode the
1886 	 * trace because it would take too long.
1887 	 */
1888 	if (rec->opts.full_auxtrace)
1889 		rec->buildid_all = true;
1890 
1891 	if (record_opts__config(&rec->opts)) {
1892 		err = -EINVAL;
1893 		goto out;
1894 	}
1895 
1896 	err = __cmd_record(&record, argc, argv);
1897 out:
1898 	perf_evlist__delete(rec->evlist);
1899 	symbol__exit();
1900 	auxtrace_record__free(rec->itr);
1901 	return err;
1902 }
1903 
1904 static void snapshot_sig_handler(int sig __maybe_unused)
1905 {
1906 	struct record *rec = &record;
1907 
1908 	if (trigger_is_ready(&auxtrace_snapshot_trigger)) {
1909 		trigger_hit(&auxtrace_snapshot_trigger);
1910 		auxtrace_record__snapshot_started = 1;
1911 		if (auxtrace_record__snapshot_start(record.itr))
1912 			trigger_error(&auxtrace_snapshot_trigger);
1913 	}
1914 
1915 	if (switch_output_signal(rec))
1916 		trigger_hit(&switch_output_trigger);
1917 }
1918 
1919 static void alarm_sig_handler(int sig __maybe_unused)
1920 {
1921 	struct record *rec = &record;
1922 
1923 	if (switch_output_time(rec))
1924 		trigger_hit(&switch_output_trigger);
1925 }
1926