xref: /linux/tools/perf/builtin-record.c (revision 8e947f1e84fd1588f66e5f2ea69c80647de72cd4)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include "util/parse-options.h"
15 #include "util/parse-events.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/perf_regs.h"
31 #include "util/auxtrace.h"
32 #include "util/parse-branch-options.h"
33 #include "util/parse-regs-options.h"
34 
35 #include <unistd.h>
36 #include <sched.h>
37 #include <sys/mman.h>
38 
39 
/*
 * Per-run state of 'perf record'.  The embedded 'tool' lets the event
 * callbacks in this file get back to the record via container_of().
 */
struct record {
	struct perf_tool	tool;
	struct record_opts	opts;
	/* running total of bytes successfully passed to record__write() */
	u64			bytes_written;
	/* output data file (may be a pipe, see file.is_pipe users) */
	struct perf_data_file	file;
	/* AUX area tracing recorder handed to the auxtrace read helpers */
	struct auxtrace_record	*itr;
	struct perf_evlist	*evlist;
	/* created in __cmd_record() and deleted before it returns */
	struct perf_session	*session;
	/* argv[0] as passed to __cmd_record() */
	const char		*progname;
	/* when non-zero, the SCHED_FIFO priority requested for perf itself */
	int			realtime_prio;
	bool			no_buildid;
	bool			no_buildid_cache;
	/*
	 * Bumped whenever a ring-buffer read finds new data and for every
	 * sample seen by process_sample_event(); reset to 0 before the
	 * build-id pass.
	 */
	long			samples;
};
54 
55 static int record__write(struct record *rec, void *bf, size_t size)
56 {
57 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
58 		pr_err("failed to write perf data, error: %m\n");
59 		return -1;
60 	}
61 
62 	rec->bytes_written += size;
63 	return 0;
64 }
65 
66 static int process_synthesized_event(struct perf_tool *tool,
67 				     union perf_event *event,
68 				     struct perf_sample *sample __maybe_unused,
69 				     struct machine *machine __maybe_unused)
70 {
71 	struct record *rec = container_of(tool, struct record, tool);
72 	return record__write(rec, event, event->header.size);
73 }
74 
75 static int record__mmap_read(struct record *rec, int idx)
76 {
77 	struct perf_mmap *md = &rec->evlist->mmap[idx];
78 	u64 head = perf_mmap__read_head(md);
79 	u64 old = md->prev;
80 	unsigned char *data = md->base + page_size;
81 	unsigned long size;
82 	void *buf;
83 	int rc = 0;
84 
85 	if (old == head)
86 		return 0;
87 
88 	rec->samples++;
89 
90 	size = head - old;
91 
92 	if ((old & md->mask) + size != (head & md->mask)) {
93 		buf = &data[old & md->mask];
94 		size = md->mask + 1 - (old & md->mask);
95 		old += size;
96 
97 		if (record__write(rec, buf, size) < 0) {
98 			rc = -1;
99 			goto out;
100 		}
101 	}
102 
103 	buf = &data[old & md->mask];
104 	size = head - old;
105 	old += size;
106 
107 	if (record__write(rec, buf, size) < 0) {
108 		rc = -1;
109 		goto out;
110 	}
111 
112 	md->prev = old;
113 	perf_evlist__mmap_consume(rec->evlist, idx);
114 out:
115 	return rc;
116 }
117 
/*
 * Flags shared between the signal handlers and the record loop.
 */
/* set by the signal handlers below; tells the record loop to finish */
static volatile int done;
/* signal to re-raise at exit (see record__sig_exit()); -1 means none */
static volatile int signr = -1;
/* set once SIGCHLD was seen or the workload reported an exec failure */
static volatile int child_finished;
/* 1 while the record loop is running, cleared when it stops or fails */
static volatile int auxtrace_snapshot_enabled;
/* result of the most recent AUX area snapshot attempt, 0 means success */
static volatile int auxtrace_snapshot_err;
/*
 * Tested and cleared by the record loop to trigger a snapshot read.
 * NOTE(review): presumably set by snapshot_sig_handler(), which is not
 * visible here - confirm.
 */
static volatile int auxtrace_record__snapshot_started;
124 
125 static void sig_handler(int sig)
126 {
127 	if (sig == SIGCHLD)
128 		child_finished = 1;
129 	else
130 		signr = sig;
131 
132 	done = 1;
133 }
134 
135 static void record__sig_exit(void)
136 {
137 	if (signr == -1)
138 		return;
139 
140 	signal(signr, SIG_DFL);
141 	raise(signr);
142 }
143 
144 #ifdef HAVE_AUXTRACE_SUPPORT
145 
146 static int record__process_auxtrace(struct perf_tool *tool,
147 				    union perf_event *event, void *data1,
148 				    size_t len1, void *data2, size_t len2)
149 {
150 	struct record *rec = container_of(tool, struct record, tool);
151 	struct perf_data_file *file = &rec->file;
152 	size_t padding;
153 	u8 pad[8] = {0};
154 
155 	if (!perf_data_file__is_pipe(file)) {
156 		off_t file_offset;
157 		int fd = perf_data_file__fd(file);
158 		int err;
159 
160 		file_offset = lseek(fd, 0, SEEK_CUR);
161 		if (file_offset == -1)
162 			return -1;
163 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
164 						     event, file_offset);
165 		if (err)
166 			return err;
167 	}
168 
169 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
170 	padding = (len1 + len2) & 7;
171 	if (padding)
172 		padding = 8 - padding;
173 
174 	record__write(rec, event, event->header.size);
175 	record__write(rec, data1, len1);
176 	if (len2)
177 		record__write(rec, data2, len2);
178 	record__write(rec, &pad, padding);
179 
180 	return 0;
181 }
182 
183 static int record__auxtrace_mmap_read(struct record *rec,
184 				      struct auxtrace_mmap *mm)
185 {
186 	int ret;
187 
188 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
189 				  record__process_auxtrace);
190 	if (ret < 0)
191 		return ret;
192 
193 	if (ret)
194 		rec->samples++;
195 
196 	return 0;
197 }
198 
199 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
200 					       struct auxtrace_mmap *mm)
201 {
202 	int ret;
203 
204 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
205 					   record__process_auxtrace,
206 					   rec->opts.auxtrace_snapshot_size);
207 	if (ret < 0)
208 		return ret;
209 
210 	if (ret)
211 		rec->samples++;
212 
213 	return 0;
214 }
215 
216 static int record__auxtrace_read_snapshot_all(struct record *rec)
217 {
218 	int i;
219 	int rc = 0;
220 
221 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
222 		struct auxtrace_mmap *mm =
223 				&rec->evlist->mmap[i].auxtrace_mmap;
224 
225 		if (!mm->base)
226 			continue;
227 
228 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
229 			rc = -1;
230 			goto out;
231 		}
232 	}
233 out:
234 	return rc;
235 }
236 
237 static void record__read_auxtrace_snapshot(struct record *rec)
238 {
239 	pr_debug("Recording AUX area tracing snapshot\n");
240 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
241 		auxtrace_snapshot_err = -1;
242 	} else {
243 		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
244 		if (!auxtrace_snapshot_err)
245 			auxtrace_snapshot_enabled = 1;
246 	}
247 }
248 
249 #else
250 
/*
 * Stubs used when HAVE_AUXTRACE_SUPPORT is not defined.
 */

/* No AUX area data to read: always reports success. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}

/* Nothing to snapshot: leaves the snapshot state flags untouched. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}

/* Starting a snapshot is a no-op that always reports success. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
268 
269 #endif
270 
271 static int record__open(struct record *rec)
272 {
273 	char msg[512];
274 	struct perf_evsel *pos;
275 	struct perf_evlist *evlist = rec->evlist;
276 	struct perf_session *session = rec->session;
277 	struct record_opts *opts = &rec->opts;
278 	int rc = 0;
279 
280 	perf_evlist__config(evlist, opts);
281 
282 	evlist__for_each(evlist, pos) {
283 try_again:
284 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
285 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
286 				if (verbose)
287 					ui__warning("%s\n", msg);
288 				goto try_again;
289 			}
290 
291 			rc = -errno;
292 			perf_evsel__open_strerror(pos, &opts->target,
293 						  errno, msg, sizeof(msg));
294 			ui__error("%s\n", msg);
295 			goto out;
296 		}
297 	}
298 
299 	if (perf_evlist__apply_filters(evlist, &pos)) {
300 		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
301 			pos->filter, perf_evsel__name(pos), errno,
302 			strerror_r(errno, msg, sizeof(msg)));
303 		rc = -1;
304 		goto out;
305 	}
306 
307 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
308 				 opts->auxtrace_mmap_pages,
309 				 opts->auxtrace_snapshot_mode) < 0) {
310 		if (errno == EPERM) {
311 			pr_err("Permission error mapping pages.\n"
312 			       "Consider increasing "
313 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
314 			       "or try again with a smaller value of -m/--mmap_pages.\n"
315 			       "(current value: %u,%u)\n",
316 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
317 			rc = -errno;
318 		} else {
319 			pr_err("failed to mmap with %d (%s)\n", errno,
320 				strerror_r(errno, msg, sizeof(msg)));
321 			rc = -errno;
322 		}
323 		goto out;
324 	}
325 
326 	session->evlist = evlist;
327 	perf_session__set_id_hdr_size(session);
328 out:
329 	return rc;
330 }
331 
332 static int process_sample_event(struct perf_tool *tool,
333 				union perf_event *event,
334 				struct perf_sample *sample,
335 				struct perf_evsel *evsel,
336 				struct machine *machine)
337 {
338 	struct record *rec = container_of(tool, struct record, tool);
339 
340 	rec->samples++;
341 
342 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
343 }
344 
345 static int process_buildids(struct record *rec)
346 {
347 	struct perf_data_file *file  = &rec->file;
348 	struct perf_session *session = rec->session;
349 
350 	if (file->size == 0)
351 		return 0;
352 
353 	/*
354 	 * During this process, it'll load kernel map and replace the
355 	 * dso->long_name to a real pathname it found.  In this case
356 	 * we prefer the vmlinux path like
357 	 *   /lib/modules/3.16.4/build/vmlinux
358 	 *
359 	 * rather than build-id path (in debug directory).
360 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
361 	 */
362 	symbol_conf.ignore_vmlinux_buildid = true;
363 
364 	return perf_session__process_events(session);
365 }
366 
367 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
368 {
369 	int err;
370 	struct perf_tool *tool = data;
371 	/*
372 	 *As for guest kernel when processing subcommand record&report,
373 	 *we arrange module mmap prior to guest kernel mmap and trigger
374 	 *a preload dso because default guest module symbols are loaded
375 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
376 	 *method is used to avoid symbol missing when the first addr is
377 	 *in module instead of in guest kernel.
378 	 */
379 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
380 					     machine);
381 	if (err < 0)
382 		pr_err("Couldn't record guest kernel [%d]'s reference"
383 		       " relocation symbol.\n", machine->pid);
384 
385 	/*
386 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
387 	 * have no _text sometimes.
388 	 */
389 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
390 						 machine);
391 	if (err < 0)
392 		pr_err("Couldn't record guest kernel [%d]'s reference"
393 		       " relocation symbol.\n", machine->pid);
394 }
395 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event; written after a pass
 * over the ring buffers that produced output.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
400 
401 static int record__mmap_read_all(struct record *rec)
402 {
403 	u64 bytes_written = rec->bytes_written;
404 	int i;
405 	int rc = 0;
406 
407 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
408 		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
409 
410 		if (rec->evlist->mmap[i].base) {
411 			if (record__mmap_read(rec, i) != 0) {
412 				rc = -1;
413 				goto out;
414 			}
415 		}
416 
417 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
418 		    record__auxtrace_mmap_read(rec, mm) != 0) {
419 			rc = -1;
420 			goto out;
421 		}
422 	}
423 
424 	/*
425 	 * Mark the round finished in case we wrote
426 	 * at least one event.
427 	 */
428 	if (bytes_written != rec->bytes_written)
429 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
430 
431 out:
432 	return rc;
433 }
434 
435 static void record__init_features(struct record *rec)
436 {
437 	struct perf_session *session = rec->session;
438 	int feat;
439 
440 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
441 		perf_header__set_feat(&session->header, feat);
442 
443 	if (rec->no_buildid)
444 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
445 
446 	if (!have_tracepoints(&rec->evlist->entries))
447 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
448 
449 	if (!rec->opts.branch_stack)
450 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
451 
452 	if (!rec->opts.full_auxtrace)
453 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
454 }
455 
456 static volatile int workload_exec_errno;
457 
458 /*
459  * perf_evlist__prepare_workload will send a SIGUSR1
460  * if the fork fails, since we asked by setting its
461  * want_signal to true.
462  */
463 static void workload_exec_failed_signal(int signo __maybe_unused,
464 					siginfo_t *info,
465 					void *ucontext __maybe_unused)
466 {
467 	workload_exec_errno = info->si_value.sival_int;
468 	done = 1;
469 	child_finished = 1;
470 }
471 
472 static void snapshot_sig_handler(int sig);
473 
474 static int __cmd_record(struct record *rec, int argc, const char **argv)
475 {
476 	int err;
477 	int status = 0;
478 	unsigned long waking = 0;
479 	const bool forks = argc > 0;
480 	struct machine *machine;
481 	struct perf_tool *tool = &rec->tool;
482 	struct record_opts *opts = &rec->opts;
483 	struct perf_data_file *file = &rec->file;
484 	struct perf_session *session;
485 	bool disabled = false, draining = false;
486 	int fd;
487 
488 	rec->progname = argv[0];
489 
490 	atexit(record__sig_exit);
491 	signal(SIGCHLD, sig_handler);
492 	signal(SIGINT, sig_handler);
493 	signal(SIGTERM, sig_handler);
494 	if (rec->opts.auxtrace_snapshot_mode)
495 		signal(SIGUSR2, snapshot_sig_handler);
496 	else
497 		signal(SIGUSR2, SIG_IGN);
498 
499 	session = perf_session__new(file, false, tool);
500 	if (session == NULL) {
501 		pr_err("Perf session creation failed.\n");
502 		return -1;
503 	}
504 
505 	fd = perf_data_file__fd(file);
506 	rec->session = session;
507 
508 	record__init_features(rec);
509 
510 	if (forks) {
511 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
512 						    argv, file->is_pipe,
513 						    workload_exec_failed_signal);
514 		if (err < 0) {
515 			pr_err("Couldn't run the workload!\n");
516 			status = err;
517 			goto out_delete_session;
518 		}
519 	}
520 
521 	if (record__open(rec) != 0) {
522 		err = -1;
523 		goto out_child;
524 	}
525 
526 	/*
527 	 * Normally perf_session__new would do this, but it doesn't have the
528 	 * evlist.
529 	 */
530 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
531 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
532 		rec->tool.ordered_events = false;
533 	}
534 
535 	if (!rec->evlist->nr_groups)
536 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
537 
538 	if (file->is_pipe) {
539 		err = perf_header__write_pipe(fd);
540 		if (err < 0)
541 			goto out_child;
542 	} else {
543 		err = perf_session__write_header(session, rec->evlist, fd, false);
544 		if (err < 0)
545 			goto out_child;
546 	}
547 
548 	if (!rec->no_buildid
549 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
550 		pr_err("Couldn't generate buildids. "
551 		       "Use --no-buildid to profile anyway.\n");
552 		err = -1;
553 		goto out_child;
554 	}
555 
556 	machine = &session->machines.host;
557 
558 	if (file->is_pipe) {
559 		err = perf_event__synthesize_attrs(tool, session,
560 						   process_synthesized_event);
561 		if (err < 0) {
562 			pr_err("Couldn't synthesize attrs.\n");
563 			goto out_child;
564 		}
565 
566 		if (have_tracepoints(&rec->evlist->entries)) {
567 			/*
568 			 * FIXME err <= 0 here actually means that
569 			 * there were no tracepoints so its not really
570 			 * an error, just that we don't need to
571 			 * synthesize anything.  We really have to
572 			 * return this more properly and also
573 			 * propagate errors that now are calling die()
574 			 */
575 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
576 								  process_synthesized_event);
577 			if (err <= 0) {
578 				pr_err("Couldn't record tracing data.\n");
579 				goto out_child;
580 			}
581 			rec->bytes_written += err;
582 		}
583 	}
584 
585 	if (rec->opts.full_auxtrace) {
586 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
587 					session, process_synthesized_event);
588 		if (err)
589 			goto out_delete_session;
590 	}
591 
592 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
593 						 machine);
594 	if (err < 0)
595 		pr_err("Couldn't record kernel reference relocation symbol\n"
596 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
597 		       "Check /proc/kallsyms permission or run as root.\n");
598 
599 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
600 					     machine);
601 	if (err < 0)
602 		pr_err("Couldn't record kernel module information.\n"
603 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
604 		       "Check /proc/modules permission or run as root.\n");
605 
606 	if (perf_guest) {
607 		machines__process_guests(&session->machines,
608 					 perf_event__synthesize_guest_os, tool);
609 	}
610 
611 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
612 					    process_synthesized_event, opts->sample_address,
613 					    opts->proc_map_timeout);
614 	if (err != 0)
615 		goto out_child;
616 
617 	if (rec->realtime_prio) {
618 		struct sched_param param;
619 
620 		param.sched_priority = rec->realtime_prio;
621 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
622 			pr_err("Could not set realtime priority.\n");
623 			err = -1;
624 			goto out_child;
625 		}
626 	}
627 
628 	/*
629 	 * When perf is starting the traced process, all the events
630 	 * (apart from group members) have enable_on_exec=1 set,
631 	 * so don't spoil it by prematurely enabling them.
632 	 */
633 	if (!target__none(&opts->target) && !opts->initial_delay)
634 		perf_evlist__enable(rec->evlist);
635 
636 	/*
637 	 * Let the child rip
638 	 */
639 	if (forks) {
640 		union perf_event event;
641 		/*
642 		 * Some H/W events are generated before COMM event
643 		 * which is emitted during exec(), so perf script
644 		 * cannot see a correct process name for those events.
645 		 * Synthesize COMM event to prevent it.
646 		 */
647 		perf_event__synthesize_comm(tool, &event,
648 					    rec->evlist->workload.pid,
649 					    process_synthesized_event,
650 					    machine);
651 
652 		perf_evlist__start_workload(rec->evlist);
653 	}
654 
655 	if (opts->initial_delay) {
656 		usleep(opts->initial_delay * 1000);
657 		perf_evlist__enable(rec->evlist);
658 	}
659 
660 	auxtrace_snapshot_enabled = 1;
661 	for (;;) {
662 		int hits = rec->samples;
663 
664 		if (record__mmap_read_all(rec) < 0) {
665 			auxtrace_snapshot_enabled = 0;
666 			err = -1;
667 			goto out_child;
668 		}
669 
670 		if (auxtrace_record__snapshot_started) {
671 			auxtrace_record__snapshot_started = 0;
672 			if (!auxtrace_snapshot_err)
673 				record__read_auxtrace_snapshot(rec);
674 			if (auxtrace_snapshot_err) {
675 				pr_err("AUX area tracing snapshot failed\n");
676 				err = -1;
677 				goto out_child;
678 			}
679 		}
680 
681 		if (hits == rec->samples) {
682 			if (done || draining)
683 				break;
684 			err = perf_evlist__poll(rec->evlist, -1);
685 			/*
686 			 * Propagate error, only if there's any. Ignore positive
687 			 * number of returned events and interrupt error.
688 			 */
689 			if (err > 0 || (err < 0 && errno == EINTR))
690 				err = 0;
691 			waking++;
692 
693 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
694 				draining = true;
695 		}
696 
697 		/*
698 		 * When perf is starting the traced process, at the end events
699 		 * die with the process and we wait for that. Thus no need to
700 		 * disable events in this case.
701 		 */
702 		if (done && !disabled && !target__none(&opts->target)) {
703 			auxtrace_snapshot_enabled = 0;
704 			perf_evlist__disable(rec->evlist);
705 			disabled = true;
706 		}
707 	}
708 	auxtrace_snapshot_enabled = 0;
709 
710 	if (forks && workload_exec_errno) {
711 		char msg[STRERR_BUFSIZE];
712 		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
713 		pr_err("Workload failed: %s\n", emsg);
714 		err = -1;
715 		goto out_child;
716 	}
717 
718 	if (!quiet)
719 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
720 
721 out_child:
722 	if (forks) {
723 		int exit_status;
724 
725 		if (!child_finished)
726 			kill(rec->evlist->workload.pid, SIGTERM);
727 
728 		wait(&exit_status);
729 
730 		if (err < 0)
731 			status = err;
732 		else if (WIFEXITED(exit_status))
733 			status = WEXITSTATUS(exit_status);
734 		else if (WIFSIGNALED(exit_status))
735 			signr = WTERMSIG(exit_status);
736 	} else
737 		status = err;
738 
739 	/* this will be recalculated during process_buildids() */
740 	rec->samples = 0;
741 
742 	if (!err && !file->is_pipe) {
743 		rec->session->header.data_size += rec->bytes_written;
744 		file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
745 
746 		if (!rec->no_buildid) {
747 			process_buildids(rec);
748 			/*
749 			 * We take all buildids when the file contains
750 			 * AUX area tracing data because we do not decode the
751 			 * trace because it would take too long.
752 			 */
753 			if (rec->opts.full_auxtrace)
754 				dsos__hit_all(rec->session);
755 		}
756 		perf_session__write_header(rec->session, rec->evlist, fd, true);
757 	}
758 
759 	if (!err && !quiet) {
760 		char samples[128];
761 
762 		if (rec->samples && !rec->opts.full_auxtrace)
763 			scnprintf(samples, sizeof(samples),
764 				  " (%" PRIu64 " samples)", rec->samples);
765 		else
766 			samples[0] = '\0';
767 
768 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s ]\n",
769 			perf_data_file__size(file) / 1024.0 / 1024.0,
770 			file->path, samples);
771 	}
772 
773 out_delete_session:
774 	perf_session__delete(session);
775 	return status;
776 }
777 
778 static void callchain_debug(void)
779 {
780 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
781 
782 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
783 
784 	if (callchain_param.record_mode == CALLCHAIN_DWARF)
785 		pr_debug("callchain: stack dump size %d\n",
786 			 callchain_param.dump_size);
787 }
788 
789 int record_parse_callchain_opt(const struct option *opt,
790 			       const char *arg,
791 			       int unset)
792 {
793 	int ret;
794 	struct record_opts *record = (struct record_opts *)opt->value;
795 
796 	record->callgraph_set = true;
797 	callchain_param.enabled = !unset;
798 
799 	/* --no-call-graph */
800 	if (unset) {
801 		callchain_param.record_mode = CALLCHAIN_NONE;
802 		pr_debug("callchain: disabled\n");
803 		return 0;
804 	}
805 
806 	ret = parse_callchain_record_opt(arg, &callchain_param);
807 	if (!ret)
808 		callchain_debug();
809 
810 	return ret;
811 }
812 
813 int record_callchain_opt(const struct option *opt,
814 			 const char *arg __maybe_unused,
815 			 int unset __maybe_unused)
816 {
817 	struct record_opts *record = (struct record_opts *)opt->value;
818 
819 	record->callgraph_set = true;
820 	callchain_param.enabled = true;
821 
822 	if (callchain_param.record_mode == CALLCHAIN_NONE)
823 		callchain_param.record_mode = CALLCHAIN_FP;
824 
825 	callchain_debug();
826 	return 0;
827 }
828 
/*
 * Config callback: "record.call-graph" is treated as an alias of
 * "call-graph.record-mode"; everything is then passed on to
 * perf_default_config().
 */
static int perf_record_config(const char *var, const char *value, void *cb)
{
	const char *key = var;

	if (strcmp(var, "record.call-graph") == 0)
		key = "call-graph.record-mode";

	return perf_default_config(key, value, cb);
}
836 
/* One entry of the clock name -> clock id table used by parse_clockid(). */
struct clockid_map {
	const char *name;
	int clockid;
};

/* Table entry mapping the name @n to the clock id @c. */
#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

/* Table terminator: lookups stop at the entry whose name is NULL. */
#define CLOCKID_END	{ .name = NULL, }
846 
847 
/*
 * Fallback definitions for clock ids that the system headers may not
 * provide; we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

/*
 * Clock names accepted by parse_clockid(); the comparison there is
 * case-insensitive and an optional "CLOCK_" prefix is stripped first.
 */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
879 
880 static int parse_clockid(const struct option *opt, const char *str, int unset)
881 {
882 	struct record_opts *opts = (struct record_opts *)opt->value;
883 	const struct clockid_map *cm;
884 	const char *ostr = str;
885 
886 	if (unset) {
887 		opts->use_clockid = 0;
888 		return 0;
889 	}
890 
891 	/* no arg passed */
892 	if (!str)
893 		return 0;
894 
895 	/* no setting it twice */
896 	if (opts->use_clockid)
897 		return -1;
898 
899 	opts->use_clockid = true;
900 
901 	/* if its a number, we're done */
902 	if (sscanf(str, "%d", &opts->clockid) == 1)
903 		return 0;
904 
905 	/* allow a "CLOCK_" prefix to the name */
906 	if (!strncasecmp(str, "CLOCK_", 6))
907 		str += 6;
908 
909 	for (cm = clockids; cm->name; cm++) {
910 		if (!strcasecmp(str, cm->name)) {
911 			opts->clockid = cm->clockid;
912 			return 0;
913 		}
914 	}
915 
916 	opts->use_clockid = false;
917 	ui__warning("unknown clockid %s, check man page\n", ostr);
918 	return -1;
919 }
920 
921 static int record__parse_mmap_pages(const struct option *opt,
922 				    const char *str,
923 				    int unset __maybe_unused)
924 {
925 	struct record_opts *opts = opt->value;
926 	char *s, *p;
927 	unsigned int mmap_pages;
928 	int ret;
929 
930 	if (!str)
931 		return -EINVAL;
932 
933 	s = strdup(str);
934 	if (!s)
935 		return -ENOMEM;
936 
937 	p = strchr(s, ',');
938 	if (p)
939 		*p = '\0';
940 
941 	if (*s) {
942 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
943 		if (ret)
944 			goto out_free;
945 		opts->mmap_pages = mmap_pages;
946 	}
947 
948 	if (!p) {
949 		ret = 0;
950 		goto out_free;
951 	}
952 
953 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
954 	if (ret)
955 		goto out_free;
956 
957 	opts->auxtrace_mmap_pages = mmap_pages;
958 
959 out_free:
960 	free(s);
961 	return ret;
962 }
963 
/* Usage lines for 'perf record'; the list is NULL terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the usage list above. */
const char * const *record_usage = __record_usage;
970 
/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * record__new, because we need to have access to it in record__exit, which
 * is called after cmd_record() exits, but since record_options needs to be
 * accessible to builtin-script, leave it here.
 *
 * At least we don't touch it directly in all the other functions here.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/*
		 * NOTE(review): the *_MAX values below presumably mean
		 * "not set by the user" - confirm against the code that
		 * consumes record_opts.
		 */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		/* in ms, see the --proc-map-timeout option */
		.proc_map_timeout     = 500,
	},
	/* callbacks used when the recorded data is processed again */
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
1004 
/* Common prefix of the --call-graph help text. */
#define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "

/* "dwarf" is only listed when DWARF unwind support is compiled in. */
#ifdef HAVE_DWARF_UNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf lbr";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "fp lbr";
#endif
1012 
/*
 * Command line options of 'perf record'.  Every entry stores into the
 * global 'record' instance (or into the global verbose/quiet flags).
 *
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
struct option __record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
			   NULL, "don't record events from perf itself",
			   exclude_perf),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.file.path, "file",
		    "output file name"),
	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
			&record.opts.no_inherit_set,
			"child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	/* parsed by record__parse_mmap_pages(): "pages[,aux pages]" */
	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
		     "number of mmap data pages and AUX area tracing mmap pages",
		     record__parse_mmap_pages),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
			   NULL, "enables call-graph recording" ,
			   &record_callchain_opt),
	OPT_CALLBACK(0, "call-graph", &record.opts,
		     "mode[,dump_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
			&record.opts.sample_time_set,
			"Record the sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
		  "ms to wait before starting measurement after program start"),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	/* -b and -j share one parser and one destination */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
		    "sample by weight (on special events only)"),
	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
		    "sample transaction flags (special events only)"),
	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
		    "use per-thread mmaps"),
	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
		    "sample selected machine registers on interrupt,"
		    " use -I ? to list register names", parse_regs),
	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
		    "Record running/enabled time of read (:S) events"),
	OPT_CALLBACK('k', "clockid", &record.opts,
	"clockid", "clockid to use for events, see clock_gettime()",
	parse_clockid),
	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
			  "opts", "AUX area tracing Snapshot Mode", ""),
	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
		    "Record context switch events"),
	OPT_END()
};

/* Non-static alias of the option table above. */
struct option *record_options = __record_options;
1116 
1117 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1118 {
1119 	int err;
1120 	struct record *rec = &record;
1121 	char errbuf[BUFSIZ];
1122 
1123 	rec->evlist = perf_evlist__new();
1124 	if (rec->evlist == NULL)
1125 		return -ENOMEM;
1126 
1127 	perf_config(perf_record_config, rec);
1128 
1129 	argc = parse_options(argc, argv, record_options, record_usage,
1130 			    PARSE_OPT_STOP_AT_NON_OPTION);
1131 	if (!argc && target__none(&rec->opts.target))
1132 		usage_with_options(record_usage, record_options);
1133 
1134 	if (nr_cgroups && !rec->opts.target.system_wide) {
1135 		ui__error("cgroup monitoring only available in"
1136 			  " system-wide mode\n");
1137 		usage_with_options(record_usage, record_options);
1138 	}
1139 	if (rec->opts.record_switch_events &&
1140 	    !perf_can_record_switch_events()) {
1141 		ui__error("kernel does not support recording context switch events (--switch-events option)\n");
1142 		usage_with_options(record_usage, record_options);
1143 	}
1144 
1145 	if (!rec->itr) {
1146 		rec->itr = auxtrace_record__init(rec->evlist, &err);
1147 		if (err)
1148 			return err;
1149 	}
1150 
1151 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1152 					      rec->opts.auxtrace_snapshot_opts);
1153 	if (err)
1154 		return err;
1155 
1156 	err = -ENOMEM;
1157 
1158 	symbol__init(NULL);
1159 
1160 	if (symbol_conf.kptr_restrict)
1161 		pr_warning(
1162 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1163 "check /proc/sys/kernel/kptr_restrict.\n\n"
1164 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1165 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1166 "Samples in kernel modules won't be resolved at all.\n\n"
1167 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1168 "even with a suitable vmlinux or kallsyms file.\n\n");
1169 
1170 	if (rec->no_buildid_cache || rec->no_buildid)
1171 		disable_buildid_cache();
1172 
1173 	if (rec->evlist->nr_entries == 0 &&
1174 	    perf_evlist__add_default(rec->evlist) < 0) {
1175 		pr_err("Not enough memory for event selector list\n");
1176 		goto out_symbol_exit;
1177 	}
1178 
1179 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1180 		rec->opts.no_inherit = true;
1181 
1182 	err = target__validate(&rec->opts.target);
1183 	if (err) {
1184 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1185 		ui__warning("%s", errbuf);
1186 	}
1187 
1188 	err = target__parse_uid(&rec->opts.target);
1189 	if (err) {
1190 		int saved_errno = errno;
1191 
1192 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1193 		ui__error("%s", errbuf);
1194 
1195 		err = -saved_errno;
1196 		goto out_symbol_exit;
1197 	}
1198 
1199 	err = -ENOMEM;
1200 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1201 		usage_with_options(record_usage, record_options);
1202 
1203 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1204 	if (err)
1205 		goto out_symbol_exit;
1206 
1207 	if (record_opts__config(&rec->opts)) {
1208 		err = -EINVAL;
1209 		goto out_symbol_exit;
1210 	}
1211 
1212 	err = __cmd_record(&record, argc, argv);
1213 out_symbol_exit:
1214 	perf_evlist__delete(rec->evlist);
1215 	symbol__exit();
1216 	auxtrace_record__free(rec->itr);
1217 	return err;
1218 }
1219 
1220 static void snapshot_sig_handler(int sig __maybe_unused)
1221 {
1222 	if (!auxtrace_snapshot_enabled)
1223 		return;
1224 	auxtrace_snapshot_enabled = 0;
1225 	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1226 	auxtrace_record__snapshot_started = 1;
1227 }
1228