xref: /linux/tools/perf/builtin-record.c (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include "util/parse-options.h"
15 #include "util/parse-events.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/perf_regs.h"
31 #include "util/auxtrace.h"
32 #include "util/parse-branch-options.h"
33 #include "util/parse-regs-options.h"
34 #include "util/llvm-utils.h"
35 
36 #include <unistd.h>
37 #include <sched.h>
38 #include <sys/mman.h>
39 
40 
/*
 * State of one 'perf record' run.  A pointer to it is threaded through
 * the helpers in this file; tool callbacks recover it from their
 * struct perf_tool argument with container_of().
 */
struct record {
	struct perf_tool	tool;		/* embedded, see container_of() users */
	struct record_opts	opts;
	u64			bytes_written;	/* bytes accounted as written to the output */
	struct perf_data_file	file;
	struct auxtrace_record	*itr;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;	/* argv[0] as passed to __cmd_record() */
	int			realtime_prio;	/* non-zero: SCHED_FIFO priority to request */
	bool			no_buildid;
	bool			no_buildid_cache;
	unsigned long long	samples;	/* bumped when a read pass finds data; reset before build-id processing */
};
55 
56 static int record__write(struct record *rec, void *bf, size_t size)
57 {
58 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
59 		pr_err("failed to write perf data, error: %m\n");
60 		return -1;
61 	}
62 
63 	rec->bytes_written += size;
64 	return 0;
65 }
66 
67 static int process_synthesized_event(struct perf_tool *tool,
68 				     union perf_event *event,
69 				     struct perf_sample *sample __maybe_unused,
70 				     struct machine *machine __maybe_unused)
71 {
72 	struct record *rec = container_of(tool, struct record, tool);
73 	return record__write(rec, event, event->header.size);
74 }
75 
76 static int record__mmap_read(struct record *rec, int idx)
77 {
78 	struct perf_mmap *md = &rec->evlist->mmap[idx];
79 	u64 head = perf_mmap__read_head(md);
80 	u64 old = md->prev;
81 	unsigned char *data = md->base + page_size;
82 	unsigned long size;
83 	void *buf;
84 	int rc = 0;
85 
86 	if (old == head)
87 		return 0;
88 
89 	rec->samples++;
90 
91 	size = head - old;
92 
93 	if ((old & md->mask) + size != (head & md->mask)) {
94 		buf = &data[old & md->mask];
95 		size = md->mask + 1 - (old & md->mask);
96 		old += size;
97 
98 		if (record__write(rec, buf, size) < 0) {
99 			rc = -1;
100 			goto out;
101 		}
102 	}
103 
104 	buf = &data[old & md->mask];
105 	size = head - old;
106 	old += size;
107 
108 	if (record__write(rec, buf, size) < 0) {
109 		rc = -1;
110 		goto out;
111 	}
112 
113 	md->prev = old;
114 	perf_evlist__mmap_consume(rec->evlist, idx);
115 out:
116 	return rc;
117 }
118 
/*
 * Flags consulted by the record loop in __cmd_record().  done, signr and
 * child_finished are written from the signal handlers in this file.
 */
static volatile int done;			/* non-zero: stop recording */
static volatile int signr = -1;			/* signal to re-raise at exit, -1 if none */
static volatile int child_finished;		/* non-zero: no need to kill the workload */
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;	/* non-zero: an AUX area snapshot failed */
static volatile int auxtrace_record__snapshot_started;
125 
126 static void sig_handler(int sig)
127 {
128 	if (sig == SIGCHLD)
129 		child_finished = 1;
130 	else
131 		signr = sig;
132 
133 	done = 1;
134 }
135 
136 static void record__sig_exit(void)
137 {
138 	if (signr == -1)
139 		return;
140 
141 	signal(signr, SIG_DFL);
142 	raise(signr);
143 }
144 
145 #ifdef HAVE_AUXTRACE_SUPPORT
146 
147 static int record__process_auxtrace(struct perf_tool *tool,
148 				    union perf_event *event, void *data1,
149 				    size_t len1, void *data2, size_t len2)
150 {
151 	struct record *rec = container_of(tool, struct record, tool);
152 	struct perf_data_file *file = &rec->file;
153 	size_t padding;
154 	u8 pad[8] = {0};
155 
156 	if (!perf_data_file__is_pipe(file)) {
157 		off_t file_offset;
158 		int fd = perf_data_file__fd(file);
159 		int err;
160 
161 		file_offset = lseek(fd, 0, SEEK_CUR);
162 		if (file_offset == -1)
163 			return -1;
164 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
165 						     event, file_offset);
166 		if (err)
167 			return err;
168 	}
169 
170 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
171 	padding = (len1 + len2) & 7;
172 	if (padding)
173 		padding = 8 - padding;
174 
175 	record__write(rec, event, event->header.size);
176 	record__write(rec, data1, len1);
177 	if (len2)
178 		record__write(rec, data2, len2);
179 	record__write(rec, &pad, padding);
180 
181 	return 0;
182 }
183 
184 static int record__auxtrace_mmap_read(struct record *rec,
185 				      struct auxtrace_mmap *mm)
186 {
187 	int ret;
188 
189 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
190 				  record__process_auxtrace);
191 	if (ret < 0)
192 		return ret;
193 
194 	if (ret)
195 		rec->samples++;
196 
197 	return 0;
198 }
199 
200 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
201 					       struct auxtrace_mmap *mm)
202 {
203 	int ret;
204 
205 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
206 					   record__process_auxtrace,
207 					   rec->opts.auxtrace_snapshot_size);
208 	if (ret < 0)
209 		return ret;
210 
211 	if (ret)
212 		rec->samples++;
213 
214 	return 0;
215 }
216 
217 static int record__auxtrace_read_snapshot_all(struct record *rec)
218 {
219 	int i;
220 	int rc = 0;
221 
222 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
223 		struct auxtrace_mmap *mm =
224 				&rec->evlist->mmap[i].auxtrace_mmap;
225 
226 		if (!mm->base)
227 			continue;
228 
229 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
230 			rc = -1;
231 			goto out;
232 		}
233 	}
234 out:
235 	return rc;
236 }
237 
238 static void record__read_auxtrace_snapshot(struct record *rec)
239 {
240 	pr_debug("Recording AUX area tracing snapshot\n");
241 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
242 		auxtrace_snapshot_err = -1;
243 	} else {
244 		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
245 		if (!auxtrace_snapshot_err)
246 			auxtrace_snapshot_enabled = 1;
247 	}
248 }
249 
250 #else
251 
/*
 * Stubs used when HAVE_AUXTRACE_SUPPORT is not defined, so the callers
 * in this file build unchanged.
 */

/* No AUX area support: nothing to read, report success. */
static inline
int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
			       struct auxtrace_mmap *mm __maybe_unused)
{
	return 0;
}

/* No AUX area support: taking a snapshot is a no-op. */
static inline
void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
{
}

/* No AUX area support: starting a snapshot trivially succeeds. */
static inline
int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
{
	return 0;
}
269 
270 #endif
271 
272 static int record__open(struct record *rec)
273 {
274 	char msg[512];
275 	struct perf_evsel *pos;
276 	struct perf_evlist *evlist = rec->evlist;
277 	struct perf_session *session = rec->session;
278 	struct record_opts *opts = &rec->opts;
279 	int rc = 0;
280 
281 	perf_evlist__config(evlist, opts);
282 
283 	evlist__for_each(evlist, pos) {
284 try_again:
285 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
286 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
287 				if (verbose)
288 					ui__warning("%s\n", msg);
289 				goto try_again;
290 			}
291 
292 			rc = -errno;
293 			perf_evsel__open_strerror(pos, &opts->target,
294 						  errno, msg, sizeof(msg));
295 			ui__error("%s\n", msg);
296 			goto out;
297 		}
298 	}
299 
300 	if (perf_evlist__apply_filters(evlist, &pos)) {
301 		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
302 			pos->filter, perf_evsel__name(pos), errno,
303 			strerror_r(errno, msg, sizeof(msg)));
304 		rc = -1;
305 		goto out;
306 	}
307 
308 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
309 				 opts->auxtrace_mmap_pages,
310 				 opts->auxtrace_snapshot_mode) < 0) {
311 		if (errno == EPERM) {
312 			pr_err("Permission error mapping pages.\n"
313 			       "Consider increasing "
314 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
315 			       "or try again with a smaller value of -m/--mmap_pages.\n"
316 			       "(current value: %u,%u)\n",
317 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
318 			rc = -errno;
319 		} else {
320 			pr_err("failed to mmap with %d (%s)\n", errno,
321 				strerror_r(errno, msg, sizeof(msg)));
322 			rc = -errno;
323 		}
324 		goto out;
325 	}
326 
327 	session->evlist = evlist;
328 	perf_session__set_id_hdr_size(session);
329 out:
330 	return rc;
331 }
332 
333 static int process_sample_event(struct perf_tool *tool,
334 				union perf_event *event,
335 				struct perf_sample *sample,
336 				struct perf_evsel *evsel,
337 				struct machine *machine)
338 {
339 	struct record *rec = container_of(tool, struct record, tool);
340 
341 	rec->samples++;
342 
343 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
344 }
345 
346 static int process_buildids(struct record *rec)
347 {
348 	struct perf_data_file *file  = &rec->file;
349 	struct perf_session *session = rec->session;
350 
351 	if (file->size == 0)
352 		return 0;
353 
354 	/*
355 	 * During this process, it'll load kernel map and replace the
356 	 * dso->long_name to a real pathname it found.  In this case
357 	 * we prefer the vmlinux path like
358 	 *   /lib/modules/3.16.4/build/vmlinux
359 	 *
360 	 * rather than build-id path (in debug directory).
361 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
362 	 */
363 	symbol_conf.ignore_vmlinux_buildid = true;
364 
365 	return perf_session__process_events(session);
366 }
367 
368 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
369 {
370 	int err;
371 	struct perf_tool *tool = data;
372 	/*
373 	 *As for guest kernel when processing subcommand record&report,
374 	 *we arrange module mmap prior to guest kernel mmap and trigger
375 	 *a preload dso because default guest module symbols are loaded
376 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
377 	 *method is used to avoid symbol missing when the first addr is
378 	 *in module instead of in guest kernel.
379 	 */
380 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
381 					     machine);
382 	if (err < 0)
383 		pr_err("Couldn't record guest kernel [%d]'s reference"
384 		       " relocation symbol.\n", machine->pid);
385 
386 	/*
387 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
388 	 * have no _text sometimes.
389 	 */
390 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
391 						 machine);
392 	if (err < 0)
393 		pr_err("Couldn't record guest kernel [%d]'s reference"
394 		       " relocation symbol.\n", machine->pid);
395 }
396 
/*
 * Header-only PERF_RECORD_FINISHED_ROUND event, written by
 * record__mmap_read_all() after a pass that wrote some data.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
401 
402 static int record__mmap_read_all(struct record *rec)
403 {
404 	u64 bytes_written = rec->bytes_written;
405 	int i;
406 	int rc = 0;
407 
408 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
409 		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
410 
411 		if (rec->evlist->mmap[i].base) {
412 			if (record__mmap_read(rec, i) != 0) {
413 				rc = -1;
414 				goto out;
415 			}
416 		}
417 
418 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
419 		    record__auxtrace_mmap_read(rec, mm) != 0) {
420 			rc = -1;
421 			goto out;
422 		}
423 	}
424 
425 	/*
426 	 * Mark the round finished in case we wrote
427 	 * at least one event.
428 	 */
429 	if (bytes_written != rec->bytes_written)
430 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
431 
432 out:
433 	return rc;
434 }
435 
436 static void record__init_features(struct record *rec)
437 {
438 	struct perf_session *session = rec->session;
439 	int feat;
440 
441 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
442 		perf_header__set_feat(&session->header, feat);
443 
444 	if (rec->no_buildid)
445 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
446 
447 	if (!have_tracepoints(&rec->evlist->entries))
448 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
449 
450 	if (!rec->opts.branch_stack)
451 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
452 
453 	if (!rec->opts.full_auxtrace)
454 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
455 }
456 
457 static volatile int workload_exec_errno;
458 
459 /*
460  * perf_evlist__prepare_workload will send a SIGUSR1
461  * if the fork fails, since we asked by setting its
462  * want_signal to true.
463  */
464 static void workload_exec_failed_signal(int signo __maybe_unused,
465 					siginfo_t *info,
466 					void *ucontext __maybe_unused)
467 {
468 	workload_exec_errno = info->si_value.sival_int;
469 	done = 1;
470 	child_finished = 1;
471 }
472 
473 static void snapshot_sig_handler(int sig);
474 
475 static int __cmd_record(struct record *rec, int argc, const char **argv)
476 {
477 	int err;
478 	int status = 0;
479 	unsigned long waking = 0;
480 	const bool forks = argc > 0;
481 	struct machine *machine;
482 	struct perf_tool *tool = &rec->tool;
483 	struct record_opts *opts = &rec->opts;
484 	struct perf_data_file *file = &rec->file;
485 	struct perf_session *session;
486 	bool disabled = false, draining = false;
487 	int fd;
488 
489 	rec->progname = argv[0];
490 
491 	atexit(record__sig_exit);
492 	signal(SIGCHLD, sig_handler);
493 	signal(SIGINT, sig_handler);
494 	signal(SIGTERM, sig_handler);
495 	if (rec->opts.auxtrace_snapshot_mode)
496 		signal(SIGUSR2, snapshot_sig_handler);
497 	else
498 		signal(SIGUSR2, SIG_IGN);
499 
500 	session = perf_session__new(file, false, tool);
501 	if (session == NULL) {
502 		pr_err("Perf session creation failed.\n");
503 		return -1;
504 	}
505 
506 	fd = perf_data_file__fd(file);
507 	rec->session = session;
508 
509 	record__init_features(rec);
510 
511 	if (forks) {
512 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
513 						    argv, file->is_pipe,
514 						    workload_exec_failed_signal);
515 		if (err < 0) {
516 			pr_err("Couldn't run the workload!\n");
517 			status = err;
518 			goto out_delete_session;
519 		}
520 	}
521 
522 	if (record__open(rec) != 0) {
523 		err = -1;
524 		goto out_child;
525 	}
526 
527 	/*
528 	 * Normally perf_session__new would do this, but it doesn't have the
529 	 * evlist.
530 	 */
531 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
532 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
533 		rec->tool.ordered_events = false;
534 	}
535 
536 	if (!rec->evlist->nr_groups)
537 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
538 
539 	if (file->is_pipe) {
540 		err = perf_header__write_pipe(fd);
541 		if (err < 0)
542 			goto out_child;
543 	} else {
544 		err = perf_session__write_header(session, rec->evlist, fd, false);
545 		if (err < 0)
546 			goto out_child;
547 	}
548 
549 	if (!rec->no_buildid
550 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
551 		pr_err("Couldn't generate buildids. "
552 		       "Use --no-buildid to profile anyway.\n");
553 		err = -1;
554 		goto out_child;
555 	}
556 
557 	machine = &session->machines.host;
558 
559 	if (file->is_pipe) {
560 		err = perf_event__synthesize_attrs(tool, session,
561 						   process_synthesized_event);
562 		if (err < 0) {
563 			pr_err("Couldn't synthesize attrs.\n");
564 			goto out_child;
565 		}
566 
567 		if (have_tracepoints(&rec->evlist->entries)) {
568 			/*
569 			 * FIXME err <= 0 here actually means that
570 			 * there were no tracepoints so its not really
571 			 * an error, just that we don't need to
572 			 * synthesize anything.  We really have to
573 			 * return this more properly and also
574 			 * propagate errors that now are calling die()
575 			 */
576 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
577 								  process_synthesized_event);
578 			if (err <= 0) {
579 				pr_err("Couldn't record tracing data.\n");
580 				goto out_child;
581 			}
582 			rec->bytes_written += err;
583 		}
584 	}
585 
586 	if (rec->opts.full_auxtrace) {
587 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
588 					session, process_synthesized_event);
589 		if (err)
590 			goto out_delete_session;
591 	}
592 
593 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
594 						 machine);
595 	if (err < 0)
596 		pr_err("Couldn't record kernel reference relocation symbol\n"
597 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
598 		       "Check /proc/kallsyms permission or run as root.\n");
599 
600 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
601 					     machine);
602 	if (err < 0)
603 		pr_err("Couldn't record kernel module information.\n"
604 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
605 		       "Check /proc/modules permission or run as root.\n");
606 
607 	if (perf_guest) {
608 		machines__process_guests(&session->machines,
609 					 perf_event__synthesize_guest_os, tool);
610 	}
611 
612 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
613 					    process_synthesized_event, opts->sample_address,
614 					    opts->proc_map_timeout);
615 	if (err != 0)
616 		goto out_child;
617 
618 	if (rec->realtime_prio) {
619 		struct sched_param param;
620 
621 		param.sched_priority = rec->realtime_prio;
622 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
623 			pr_err("Could not set realtime priority.\n");
624 			err = -1;
625 			goto out_child;
626 		}
627 	}
628 
629 	/*
630 	 * When perf is starting the traced process, all the events
631 	 * (apart from group members) have enable_on_exec=1 set,
632 	 * so don't spoil it by prematurely enabling them.
633 	 */
634 	if (!target__none(&opts->target) && !opts->initial_delay)
635 		perf_evlist__enable(rec->evlist);
636 
637 	/*
638 	 * Let the child rip
639 	 */
640 	if (forks) {
641 		union perf_event *event;
642 
643 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
644 		if (event == NULL) {
645 			err = -ENOMEM;
646 			goto out_child;
647 		}
648 
649 		/*
650 		 * Some H/W events are generated before COMM event
651 		 * which is emitted during exec(), so perf script
652 		 * cannot see a correct process name for those events.
653 		 * Synthesize COMM event to prevent it.
654 		 */
655 		perf_event__synthesize_comm(tool, event,
656 					    rec->evlist->workload.pid,
657 					    process_synthesized_event,
658 					    machine);
659 		free(event);
660 
661 		perf_evlist__start_workload(rec->evlist);
662 	}
663 
664 	if (opts->initial_delay) {
665 		usleep(opts->initial_delay * 1000);
666 		perf_evlist__enable(rec->evlist);
667 	}
668 
669 	auxtrace_snapshot_enabled = 1;
670 	for (;;) {
671 		unsigned long long hits = rec->samples;
672 
673 		if (record__mmap_read_all(rec) < 0) {
674 			auxtrace_snapshot_enabled = 0;
675 			err = -1;
676 			goto out_child;
677 		}
678 
679 		if (auxtrace_record__snapshot_started) {
680 			auxtrace_record__snapshot_started = 0;
681 			if (!auxtrace_snapshot_err)
682 				record__read_auxtrace_snapshot(rec);
683 			if (auxtrace_snapshot_err) {
684 				pr_err("AUX area tracing snapshot failed\n");
685 				err = -1;
686 				goto out_child;
687 			}
688 		}
689 
690 		if (hits == rec->samples) {
691 			if (done || draining)
692 				break;
693 			err = perf_evlist__poll(rec->evlist, -1);
694 			/*
695 			 * Propagate error, only if there's any. Ignore positive
696 			 * number of returned events and interrupt error.
697 			 */
698 			if (err > 0 || (err < 0 && errno == EINTR))
699 				err = 0;
700 			waking++;
701 
702 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
703 				draining = true;
704 		}
705 
706 		/*
707 		 * When perf is starting the traced process, at the end events
708 		 * die with the process and we wait for that. Thus no need to
709 		 * disable events in this case.
710 		 */
711 		if (done && !disabled && !target__none(&opts->target)) {
712 			auxtrace_snapshot_enabled = 0;
713 			perf_evlist__disable(rec->evlist);
714 			disabled = true;
715 		}
716 	}
717 	auxtrace_snapshot_enabled = 0;
718 
719 	if (forks && workload_exec_errno) {
720 		char msg[STRERR_BUFSIZE];
721 		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
722 		pr_err("Workload failed: %s\n", emsg);
723 		err = -1;
724 		goto out_child;
725 	}
726 
727 	if (!quiet)
728 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
729 
730 out_child:
731 	if (forks) {
732 		int exit_status;
733 
734 		if (!child_finished)
735 			kill(rec->evlist->workload.pid, SIGTERM);
736 
737 		wait(&exit_status);
738 
739 		if (err < 0)
740 			status = err;
741 		else if (WIFEXITED(exit_status))
742 			status = WEXITSTATUS(exit_status);
743 		else if (WIFSIGNALED(exit_status))
744 			signr = WTERMSIG(exit_status);
745 	} else
746 		status = err;
747 
748 	/* this will be recalculated during process_buildids() */
749 	rec->samples = 0;
750 
751 	if (!err && !file->is_pipe) {
752 		rec->session->header.data_size += rec->bytes_written;
753 		file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
754 
755 		if (!rec->no_buildid) {
756 			process_buildids(rec);
757 			/*
758 			 * We take all buildids when the file contains
759 			 * AUX area tracing data because we do not decode the
760 			 * trace because it would take too long.
761 			 */
762 			if (rec->opts.full_auxtrace)
763 				dsos__hit_all(rec->session);
764 		}
765 		perf_session__write_header(rec->session, rec->evlist, fd, true);
766 	}
767 
768 	if (!err && !quiet) {
769 		char samples[128];
770 
771 		if (rec->samples && !rec->opts.full_auxtrace)
772 			scnprintf(samples, sizeof(samples),
773 				  " (%" PRIu64 " samples)", rec->samples);
774 		else
775 			samples[0] = '\0';
776 
777 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s ]\n",
778 			perf_data_file__size(file) / 1024.0 / 1024.0,
779 			file->path, samples);
780 	}
781 
782 out_delete_session:
783 	perf_session__delete(session);
784 	return status;
785 }
786 
787 static void callchain_debug(void)
788 {
789 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
790 
791 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
792 
793 	if (callchain_param.record_mode == CALLCHAIN_DWARF)
794 		pr_debug("callchain: stack dump size %d\n",
795 			 callchain_param.dump_size);
796 }
797 
798 int record_parse_callchain_opt(const struct option *opt,
799 			       const char *arg,
800 			       int unset)
801 {
802 	int ret;
803 	struct record_opts *record = (struct record_opts *)opt->value;
804 
805 	record->callgraph_set = true;
806 	callchain_param.enabled = !unset;
807 
808 	/* --no-call-graph */
809 	if (unset) {
810 		callchain_param.record_mode = CALLCHAIN_NONE;
811 		pr_debug("callchain: disabled\n");
812 		return 0;
813 	}
814 
815 	ret = parse_callchain_record_opt(arg, &callchain_param);
816 	if (!ret)
817 		callchain_debug();
818 
819 	return ret;
820 }
821 
822 int record_callchain_opt(const struct option *opt,
823 			 const char *arg __maybe_unused,
824 			 int unset __maybe_unused)
825 {
826 	struct record_opts *record = (struct record_opts *)opt->value;
827 
828 	record->callgraph_set = true;
829 	callchain_param.enabled = true;
830 
831 	if (callchain_param.record_mode == CALLCHAIN_NONE)
832 		callchain_param.record_mode = CALLCHAIN_FP;
833 
834 	callchain_debug();
835 	return 0;
836 }
837 
/*
 * Config callback for 'perf record': the key "record.call-graph" is
 * passed on as "call-graph.record-mode"; every key then goes to
 * perf_default_config().
 */
static int perf_record_config(const char *var, const char *value, void *cb)
{
	const char *key = var;

	if (strcmp(var, "record.call-graph") == 0)
		key = "call-graph.record-mode";

	return perf_default_config(key, value, cb);
}
845 
/* One entry of the name -> clock id table used by parse_clockid(). */
struct clockid_map {
	const char *name;	/* name accepted on the command line */
	int clockid;		/* corresponding CLOCK_* value */
};

/* Build one table entry mapping name @n to clock id @c. */
#define CLOCKID_MAP(n, c)	\
	{ .name = n, .clockid = (c), }

/* Table terminator: an entry whose name is NULL. */
#define CLOCKID_END	{ .name = NULL, }
855 
856 
/*
 * Add the missing ones, we need to build on many distros...
 *
 * Each fallback below is only used when the system headers do not
 * already define the constant.
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif
#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif
#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

/*
 * Clock names understood by parse_clockid(), which matches them
 * case-insensitively and walks the table up to CLOCKID_END.
 */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic", CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime", CLOCK_REALTIME),
	CLOCKID_MAP("boottime", CLOCK_BOOTTIME),
	CLOCKID_MAP("tai", CLOCK_TAI),

	/* available for the lazy */
	CLOCKID_MAP("mono", CLOCK_MONOTONIC),
	CLOCKID_MAP("raw", CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real", CLOCK_REALTIME),
	CLOCKID_MAP("boot", CLOCK_BOOTTIME),

	CLOCKID_END,
};
888 
889 static int parse_clockid(const struct option *opt, const char *str, int unset)
890 {
891 	struct record_opts *opts = (struct record_opts *)opt->value;
892 	const struct clockid_map *cm;
893 	const char *ostr = str;
894 
895 	if (unset) {
896 		opts->use_clockid = 0;
897 		return 0;
898 	}
899 
900 	/* no arg passed */
901 	if (!str)
902 		return 0;
903 
904 	/* no setting it twice */
905 	if (opts->use_clockid)
906 		return -1;
907 
908 	opts->use_clockid = true;
909 
910 	/* if its a number, we're done */
911 	if (sscanf(str, "%d", &opts->clockid) == 1)
912 		return 0;
913 
914 	/* allow a "CLOCK_" prefix to the name */
915 	if (!strncasecmp(str, "CLOCK_", 6))
916 		str += 6;
917 
918 	for (cm = clockids; cm->name; cm++) {
919 		if (!strcasecmp(str, cm->name)) {
920 			opts->clockid = cm->clockid;
921 			return 0;
922 		}
923 	}
924 
925 	opts->use_clockid = false;
926 	ui__warning("unknown clockid %s, check man page\n", ostr);
927 	return -1;
928 }
929 
930 static int record__parse_mmap_pages(const struct option *opt,
931 				    const char *str,
932 				    int unset __maybe_unused)
933 {
934 	struct record_opts *opts = opt->value;
935 	char *s, *p;
936 	unsigned int mmap_pages;
937 	int ret;
938 
939 	if (!str)
940 		return -EINVAL;
941 
942 	s = strdup(str);
943 	if (!s)
944 		return -ENOMEM;
945 
946 	p = strchr(s, ',');
947 	if (p)
948 		*p = '\0';
949 
950 	if (*s) {
951 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
952 		if (ret)
953 			goto out_free;
954 		opts->mmap_pages = mmap_pages;
955 	}
956 
957 	if (!p) {
958 		ret = 0;
959 		goto out_free;
960 	}
961 
962 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
963 	if (ret)
964 		goto out_free;
965 
966 	opts->auxtrace_mmap_pages = mmap_pages;
967 
968 out_free:
969 	free(s);
970 	return ret;
971 }
972 
/* NULL-terminated list of usage strings for 'perf record'. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
/* Non-static alias of the list above. */
const char * const *record_usage = __record_usage;
979 
/*
 * XXX Ideally this would be local to cmd_record() and passed to a
 * record__new, because we need to have access to it in record__exit, which
 * is called after cmd_record() exits. But since record_options needs to be
 * accessible to builtin-script, leave it here.
 *
 * At least we don't touch it directly in all the other functions here.
 *
 * Just say no to tons of global variables, sigh.
 */
/*
 * The single 'perf record' instance: default option values plus the
 * tool callbacks.  The option table below points into this object.
 */
static struct record record = {
	.opts = {
		.sample_time	     = true,
		/* NOTE(review): the *_MAX values look like "not set by the user" sentinels - confirm */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
			.default_per_cpu = true,
		},
		/* in ms, per the --proc-map-timeout help text */
		.proc_map_timeout     = 500,
	},
	.tool = {
		.sample		= process_sample_event,
		.fork		= perf_event__process_fork,
		.exit		= perf_event__process_exit,
		.comm		= perf_event__process_comm,
		.mmap		= perf_event__process_mmap,
		.mmap2		= perf_event__process_mmap2,
		.ordered_events	= true,
	},
};
1013 
/* Help text of the --call-graph option: the shared text plus the default mode. */
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
	"\n\t\t\t\tDefault: fp";
1016 
1017 /*
1018  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1019  * with it and switch to use the library functions in perf_evlist that came
1020  * from builtin-record.c, i.e. use record_opts,
1021  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1022  * using pipes, etc.
1023  */
1024 struct option __record_options[] = {
1025 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1026 		     "event selector. use 'perf list' to list available events",
1027 		     parse_events_option),
1028 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1029 		     "event filter", parse_filter),
1030 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1031 			   NULL, "don't record events from perf itself",
1032 			   exclude_perf),
1033 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1034 		    "record events on existing process id"),
1035 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1036 		    "record events on existing thread id"),
1037 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1038 		    "collect data with this RT SCHED_FIFO priority"),
1039 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1040 		    "collect data without buffering"),
1041 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1042 		    "collect raw sample records from all opened counters"),
1043 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1044 			    "system-wide collection from all CPUs"),
1045 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1046 		    "list of cpus to monitor"),
1047 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1048 	OPT_STRING('o', "output", &record.file.path, "file",
1049 		    "output file name"),
1050 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1051 			&record.opts.no_inherit_set,
1052 			"child tasks do not inherit counters"),
1053 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1054 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1055 		     "number of mmap data pages and AUX area tracing mmap pages",
1056 		     record__parse_mmap_pages),
1057 	OPT_BOOLEAN(0, "group", &record.opts.group,
1058 		    "put the counters into a counter group"),
1059 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
1060 			   NULL, "enables call-graph recording" ,
1061 			   &record_callchain_opt),
1062 	OPT_CALLBACK(0, "call-graph", &record.opts,
1063 		     "record_mode[,record_size]", record_callchain_help,
1064 		     &record_parse_callchain_opt),
1065 	OPT_INCR('v', "verbose", &verbose,
1066 		    "be more verbose (show counter open errors, etc)"),
1067 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1068 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1069 		    "per thread counts"),
1070 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1071 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1072 			&record.opts.sample_time_set,
1073 			"Record the sample timestamps"),
1074 	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
1075 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1076 		    "don't sample"),
1077 	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
1078 		    "do not update the buildid cache"),
1079 	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
1080 		    "do not collect buildids in perf.data"),
1081 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1082 		     "monitor event in cgroup name only",
1083 		     parse_cgroups),
1084 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1085 		  "ms to wait before starting measurement after program start"),
1086 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1087 		   "user to profile"),
1088 
1089 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1090 		     "branch any", "sample any taken branches",
1091 		     parse_branch_stack),
1092 
1093 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1094 		     "branch filter mask", "branch stack filter modes",
1095 		     parse_branch_stack),
1096 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1097 		    "sample by weight (on special events only)"),
1098 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1099 		    "sample transaction flags (special events only)"),
1100 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1101 		    "use per-thread mmaps"),
1102 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1103 		    "sample selected machine registers on interrupt,"
1104 		    " use -I ? to list register names", parse_regs),
1105 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1106 		    "Record running/enabled time of read (:S) events"),
1107 	OPT_CALLBACK('k', "clockid", &record.opts,
1108 	"clockid", "clockid to use for events, see clock_gettime()",
1109 	parse_clockid),
1110 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1111 			  "opts", "AUX area tracing Snapshot Mode", ""),
1112 	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1113 			"per thread proc mmap processing timeout in ms"),
1114 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1115 		    "Record context switch events"),
1116 #ifdef HAVE_LIBBPF_SUPPORT
1117 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1118 		   "clang binary to use for compiling BPF scriptlets"),
1119 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1120 		   "options passed to clang when compiling BPF scriptlets"),
1121 #endif
1122 	OPT_END()
1123 };
1124 
/*
 * Non-static handle on the option table initialised from __record_options.
 * cmd_record() passes it to parse_options() and to the usage helpers.
 * NOTE(review): presumably exported so other builtins can reuse the
 * 'perf record' options - confirm against the users outside this file.
 */
struct option *record_options = __record_options;
1126 
1127 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1128 {
1129 	int err;
1130 	struct record *rec = &record;
1131 	char errbuf[BUFSIZ];
1132 
1133 	rec->evlist = perf_evlist__new();
1134 	if (rec->evlist == NULL)
1135 		return -ENOMEM;
1136 
1137 	perf_config(perf_record_config, rec);
1138 
1139 	argc = parse_options(argc, argv, record_options, record_usage,
1140 			    PARSE_OPT_STOP_AT_NON_OPTION);
1141 	if (!argc && target__none(&rec->opts.target))
1142 		usage_with_options(record_usage, record_options);
1143 
1144 	if (nr_cgroups && !rec->opts.target.system_wide) {
1145 		usage_with_options_msg(record_usage, record_options,
1146 			"cgroup monitoring only available in system-wide mode");
1147 
1148 	}
1149 	if (rec->opts.record_switch_events &&
1150 	    !perf_can_record_switch_events()) {
1151 		ui__error("kernel does not support recording context switch events\n");
1152 		parse_options_usage(record_usage, record_options, "switch-events", 0);
1153 		return -EINVAL;
1154 	}
1155 
1156 	if (!rec->itr) {
1157 		rec->itr = auxtrace_record__init(rec->evlist, &err);
1158 		if (err)
1159 			return err;
1160 	}
1161 
1162 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1163 					      rec->opts.auxtrace_snapshot_opts);
1164 	if (err)
1165 		return err;
1166 
1167 	err = -ENOMEM;
1168 
1169 	symbol__init(NULL);
1170 
1171 	if (symbol_conf.kptr_restrict)
1172 		pr_warning(
1173 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1174 "check /proc/sys/kernel/kptr_restrict.\n\n"
1175 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1176 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1177 "Samples in kernel modules won't be resolved at all.\n\n"
1178 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1179 "even with a suitable vmlinux or kallsyms file.\n\n");
1180 
1181 	if (rec->no_buildid_cache || rec->no_buildid)
1182 		disable_buildid_cache();
1183 
1184 	if (rec->evlist->nr_entries == 0 &&
1185 	    perf_evlist__add_default(rec->evlist) < 0) {
1186 		pr_err("Not enough memory for event selector list\n");
1187 		goto out_symbol_exit;
1188 	}
1189 
1190 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1191 		rec->opts.no_inherit = true;
1192 
1193 	err = target__validate(&rec->opts.target);
1194 	if (err) {
1195 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1196 		ui__warning("%s", errbuf);
1197 	}
1198 
1199 	err = target__parse_uid(&rec->opts.target);
1200 	if (err) {
1201 		int saved_errno = errno;
1202 
1203 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1204 		ui__error("%s", errbuf);
1205 
1206 		err = -saved_errno;
1207 		goto out_symbol_exit;
1208 	}
1209 
1210 	err = -ENOMEM;
1211 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1212 		usage_with_options(record_usage, record_options);
1213 
1214 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1215 	if (err)
1216 		goto out_symbol_exit;
1217 
1218 	if (record_opts__config(&rec->opts)) {
1219 		err = -EINVAL;
1220 		goto out_symbol_exit;
1221 	}
1222 
1223 	err = __cmd_record(&record, argc, argv);
1224 out_symbol_exit:
1225 	perf_evlist__delete(rec->evlist);
1226 	symbol__exit();
1227 	auxtrace_record__free(rec->itr);
1228 	return err;
1229 }
1230 
1231 static void snapshot_sig_handler(int sig __maybe_unused)
1232 {
1233 	if (!auxtrace_snapshot_enabled)
1234 		return;
1235 	auxtrace_snapshot_enabled = 0;
1236 	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1237 	auxtrace_record__snapshot_started = 1;
1238 }
1239