xref: /linux/tools/perf/builtin-record.c (revision 607bfbd7ffc60156ae0831c917497dc91a57dd8d)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/build-id.h"
13 #include "util/util.h"
14 #include <subcmd/parse-options.h>
15 #include "util/parse-events.h"
16 
17 #include "util/callchain.h"
18 #include "util/cgroup.h"
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 #include "util/data.h"
30 #include "util/perf_regs.h"
31 #include "util/auxtrace.h"
32 #include "util/parse-branch-options.h"
33 #include "util/parse-regs-options.h"
34 #include "util/llvm-utils.h"
35 
36 #include <unistd.h>
37 #include <sched.h>
38 #include <sys/mman.h>
39 
40 
41 struct record {
42 	struct perf_tool	tool;
43 	struct record_opts	opts;
44 	u64			bytes_written;
45 	struct perf_data_file	file;
46 	struct auxtrace_record	*itr;
47 	struct perf_evlist	*evlist;
48 	struct perf_session	*session;
49 	const char		*progname;
50 	int			realtime_prio;
51 	bool			no_buildid;
52 	bool			no_buildid_set;
53 	bool			no_buildid_cache;
54 	bool			no_buildid_cache_set;
55 	bool			buildid_all;
56 	unsigned long long	samples;
57 };
58 
59 static int record__write(struct record *rec, void *bf, size_t size)
60 {
61 	if (perf_data_file__write(rec->session->file, bf, size) < 0) {
62 		pr_err("failed to write perf data, error: %m\n");
63 		return -1;
64 	}
65 
66 	rec->bytes_written += size;
67 	return 0;
68 }
69 
70 static int process_synthesized_event(struct perf_tool *tool,
71 				     union perf_event *event,
72 				     struct perf_sample *sample __maybe_unused,
73 				     struct machine *machine __maybe_unused)
74 {
75 	struct record *rec = container_of(tool, struct record, tool);
76 	return record__write(rec, event, event->header.size);
77 }
78 
79 static int record__mmap_read(struct record *rec, int idx)
80 {
81 	struct perf_mmap *md = &rec->evlist->mmap[idx];
82 	u64 head = perf_mmap__read_head(md);
83 	u64 old = md->prev;
84 	unsigned char *data = md->base + page_size;
85 	unsigned long size;
86 	void *buf;
87 	int rc = 0;
88 
89 	if (old == head)
90 		return 0;
91 
92 	rec->samples++;
93 
94 	size = head - old;
95 
96 	if ((old & md->mask) + size != (head & md->mask)) {
97 		buf = &data[old & md->mask];
98 		size = md->mask + 1 - (old & md->mask);
99 		old += size;
100 
101 		if (record__write(rec, buf, size) < 0) {
102 			rc = -1;
103 			goto out;
104 		}
105 	}
106 
107 	buf = &data[old & md->mask];
108 	size = head - old;
109 	old += size;
110 
111 	if (record__write(rec, buf, size) < 0) {
112 		rc = -1;
113 		goto out;
114 	}
115 
116 	md->prev = old;
117 	perf_evlist__mmap_consume(rec->evlist, idx);
118 out:
119 	return rc;
120 }
121 
/*
 * Flags written from signal handlers and read by the recording code.
 * signr stays at -1 until a signal other than SIGCHLD has been seen.
 */
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;
static volatile int auxtrace_snapshot_enabled;
static volatile int auxtrace_snapshot_err;
static volatile int auxtrace_record__snapshot_started;

/*
 * Common handler: SIGCHLD marks the child as finished, any other signal
 * is remembered in signr.  Either way the main loop is asked to stop.
 */
static void sig_handler(int sig)
{
	switch (sig) {
	case SIGCHLD:
		child_finished = 1;
		break;
	default:
		signr = sig;
		break;
	}

	done = 1;
}
138 
139 static void record__sig_exit(void)
140 {
141 	if (signr == -1)
142 		return;
143 
144 	signal(signr, SIG_DFL);
145 	raise(signr);
146 }
147 
148 #ifdef HAVE_AUXTRACE_SUPPORT
149 
150 static int record__process_auxtrace(struct perf_tool *tool,
151 				    union perf_event *event, void *data1,
152 				    size_t len1, void *data2, size_t len2)
153 {
154 	struct record *rec = container_of(tool, struct record, tool);
155 	struct perf_data_file *file = &rec->file;
156 	size_t padding;
157 	u8 pad[8] = {0};
158 
159 	if (!perf_data_file__is_pipe(file)) {
160 		off_t file_offset;
161 		int fd = perf_data_file__fd(file);
162 		int err;
163 
164 		file_offset = lseek(fd, 0, SEEK_CUR);
165 		if (file_offset == -1)
166 			return -1;
167 		err = auxtrace_index__auxtrace_event(&rec->session->auxtrace_index,
168 						     event, file_offset);
169 		if (err)
170 			return err;
171 	}
172 
173 	/* event.auxtrace.size includes padding, see __auxtrace_mmap__read() */
174 	padding = (len1 + len2) & 7;
175 	if (padding)
176 		padding = 8 - padding;
177 
178 	record__write(rec, event, event->header.size);
179 	record__write(rec, data1, len1);
180 	if (len2)
181 		record__write(rec, data2, len2);
182 	record__write(rec, &pad, padding);
183 
184 	return 0;
185 }
186 
187 static int record__auxtrace_mmap_read(struct record *rec,
188 				      struct auxtrace_mmap *mm)
189 {
190 	int ret;
191 
192 	ret = auxtrace_mmap__read(mm, rec->itr, &rec->tool,
193 				  record__process_auxtrace);
194 	if (ret < 0)
195 		return ret;
196 
197 	if (ret)
198 		rec->samples++;
199 
200 	return 0;
201 }
202 
203 static int record__auxtrace_mmap_read_snapshot(struct record *rec,
204 					       struct auxtrace_mmap *mm)
205 {
206 	int ret;
207 
208 	ret = auxtrace_mmap__read_snapshot(mm, rec->itr, &rec->tool,
209 					   record__process_auxtrace,
210 					   rec->opts.auxtrace_snapshot_size);
211 	if (ret < 0)
212 		return ret;
213 
214 	if (ret)
215 		rec->samples++;
216 
217 	return 0;
218 }
219 
220 static int record__auxtrace_read_snapshot_all(struct record *rec)
221 {
222 	int i;
223 	int rc = 0;
224 
225 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
226 		struct auxtrace_mmap *mm =
227 				&rec->evlist->mmap[i].auxtrace_mmap;
228 
229 		if (!mm->base)
230 			continue;
231 
232 		if (record__auxtrace_mmap_read_snapshot(rec, mm) != 0) {
233 			rc = -1;
234 			goto out;
235 		}
236 	}
237 out:
238 	return rc;
239 }
240 
241 static void record__read_auxtrace_snapshot(struct record *rec)
242 {
243 	pr_debug("Recording AUX area tracing snapshot\n");
244 	if (record__auxtrace_read_snapshot_all(rec) < 0) {
245 		auxtrace_snapshot_err = -1;
246 	} else {
247 		auxtrace_snapshot_err = auxtrace_record__snapshot_finish(rec->itr);
248 		if (!auxtrace_snapshot_err)
249 			auxtrace_snapshot_enabled = 1;
250 	}
251 }
252 
253 #else
254 
255 static inline
256 int record__auxtrace_mmap_read(struct record *rec __maybe_unused,
257 			       struct auxtrace_mmap *mm __maybe_unused)
258 {
259 	return 0;
260 }
261 
262 static inline
263 void record__read_auxtrace_snapshot(struct record *rec __maybe_unused)
264 {
265 }
266 
267 static inline
268 int auxtrace_record__snapshot_start(struct auxtrace_record *itr __maybe_unused)
269 {
270 	return 0;
271 }
272 
273 #endif
274 
275 static int record__open(struct record *rec)
276 {
277 	char msg[512];
278 	struct perf_evsel *pos;
279 	struct perf_evlist *evlist = rec->evlist;
280 	struct perf_session *session = rec->session;
281 	struct record_opts *opts = &rec->opts;
282 	int rc = 0;
283 
284 	perf_evlist__config(evlist, opts);
285 
286 	evlist__for_each(evlist, pos) {
287 try_again:
288 		if (perf_evsel__open(pos, pos->cpus, pos->threads) < 0) {
289 			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
290 				if (verbose)
291 					ui__warning("%s\n", msg);
292 				goto try_again;
293 			}
294 
295 			rc = -errno;
296 			perf_evsel__open_strerror(pos, &opts->target,
297 						  errno, msg, sizeof(msg));
298 			ui__error("%s\n", msg);
299 			goto out;
300 		}
301 	}
302 
303 	if (perf_evlist__apply_filters(evlist, &pos)) {
304 		error("failed to set filter \"%s\" on event %s with %d (%s)\n",
305 			pos->filter, perf_evsel__name(pos), errno,
306 			strerror_r(errno, msg, sizeof(msg)));
307 		rc = -1;
308 		goto out;
309 	}
310 
311 	if (perf_evlist__mmap_ex(evlist, opts->mmap_pages, false,
312 				 opts->auxtrace_mmap_pages,
313 				 opts->auxtrace_snapshot_mode) < 0) {
314 		if (errno == EPERM) {
315 			pr_err("Permission error mapping pages.\n"
316 			       "Consider increasing "
317 			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
318 			       "or try again with a smaller value of -m/--mmap_pages.\n"
319 			       "(current value: %u,%u)\n",
320 			       opts->mmap_pages, opts->auxtrace_mmap_pages);
321 			rc = -errno;
322 		} else {
323 			pr_err("failed to mmap with %d (%s)\n", errno,
324 				strerror_r(errno, msg, sizeof(msg)));
325 			rc = -errno;
326 		}
327 		goto out;
328 	}
329 
330 	session->evlist = evlist;
331 	perf_session__set_id_hdr_size(session);
332 out:
333 	return rc;
334 }
335 
336 static int process_sample_event(struct perf_tool *tool,
337 				union perf_event *event,
338 				struct perf_sample *sample,
339 				struct perf_evsel *evsel,
340 				struct machine *machine)
341 {
342 	struct record *rec = container_of(tool, struct record, tool);
343 
344 	rec->samples++;
345 
346 	return build_id__mark_dso_hit(tool, event, sample, evsel, machine);
347 }
348 
349 static int process_buildids(struct record *rec)
350 {
351 	struct perf_data_file *file  = &rec->file;
352 	struct perf_session *session = rec->session;
353 
354 	if (file->size == 0)
355 		return 0;
356 
357 	/*
358 	 * During this process, it'll load kernel map and replace the
359 	 * dso->long_name to a real pathname it found.  In this case
360 	 * we prefer the vmlinux path like
361 	 *   /lib/modules/3.16.4/build/vmlinux
362 	 *
363 	 * rather than build-id path (in debug directory).
364 	 *   $HOME/.debug/.build-id/f0/6e17aa50adf4d00b88925e03775de107611551
365 	 */
366 	symbol_conf.ignore_vmlinux_buildid = true;
367 
368 	/*
369 	 * If --buildid-all is given, it marks all DSO regardless of hits,
370 	 * so no need to process samples.
371 	 */
372 	if (rec->buildid_all)
373 		rec->tool.sample = NULL;
374 
375 	return perf_session__process_events(session);
376 }
377 
378 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
379 {
380 	int err;
381 	struct perf_tool *tool = data;
382 	/*
383 	 *As for guest kernel when processing subcommand record&report,
384 	 *we arrange module mmap prior to guest kernel mmap and trigger
385 	 *a preload dso because default guest module symbols are loaded
386 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
387 	 *method is used to avoid symbol missing when the first addr is
388 	 *in module instead of in guest kernel.
389 	 */
390 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
391 					     machine);
392 	if (err < 0)
393 		pr_err("Couldn't record guest kernel [%d]'s reference"
394 		       " relocation symbol.\n", machine->pid);
395 
396 	/*
397 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
398 	 * have no _text sometimes.
399 	 */
400 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
401 						 machine);
402 	if (err < 0)
403 		pr_err("Couldn't record guest kernel [%d]'s reference"
404 		       " relocation symbol.\n", machine->pid);
405 }
406 
407 static struct perf_event_header finished_round_event = {
408 	.size = sizeof(struct perf_event_header),
409 	.type = PERF_RECORD_FINISHED_ROUND,
410 };
411 
412 static int record__mmap_read_all(struct record *rec)
413 {
414 	u64 bytes_written = rec->bytes_written;
415 	int i;
416 	int rc = 0;
417 
418 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
419 		struct auxtrace_mmap *mm = &rec->evlist->mmap[i].auxtrace_mmap;
420 
421 		if (rec->evlist->mmap[i].base) {
422 			if (record__mmap_read(rec, i) != 0) {
423 				rc = -1;
424 				goto out;
425 			}
426 		}
427 
428 		if (mm->base && !rec->opts.auxtrace_snapshot_mode &&
429 		    record__auxtrace_mmap_read(rec, mm) != 0) {
430 			rc = -1;
431 			goto out;
432 		}
433 	}
434 
435 	/*
436 	 * Mark the round finished in case we wrote
437 	 * at least one event.
438 	 */
439 	if (bytes_written != rec->bytes_written)
440 		rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
441 
442 out:
443 	return rc;
444 }
445 
446 static void record__init_features(struct record *rec)
447 {
448 	struct perf_session *session = rec->session;
449 	int feat;
450 
451 	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
452 		perf_header__set_feat(&session->header, feat);
453 
454 	if (rec->no_buildid)
455 		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
456 
457 	if (!have_tracepoints(&rec->evlist->entries))
458 		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
459 
460 	if (!rec->opts.branch_stack)
461 		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
462 
463 	if (!rec->opts.full_auxtrace)
464 		perf_header__clear_feat(&session->header, HEADER_AUXTRACE);
465 
466 	perf_header__clear_feat(&session->header, HEADER_STAT);
467 }
468 
469 static volatile int workload_exec_errno;
470 
471 /*
472  * perf_evlist__prepare_workload will send a SIGUSR1
473  * if the fork fails, since we asked by setting its
474  * want_signal to true.
475  */
476 static void workload_exec_failed_signal(int signo __maybe_unused,
477 					siginfo_t *info,
478 					void *ucontext __maybe_unused)
479 {
480 	workload_exec_errno = info->si_value.sival_int;
481 	done = 1;
482 	child_finished = 1;
483 }
484 
485 static void snapshot_sig_handler(int sig);
486 
487 static int __cmd_record(struct record *rec, int argc, const char **argv)
488 {
489 	int err;
490 	int status = 0;
491 	unsigned long waking = 0;
492 	const bool forks = argc > 0;
493 	struct machine *machine;
494 	struct perf_tool *tool = &rec->tool;
495 	struct record_opts *opts = &rec->opts;
496 	struct perf_data_file *file = &rec->file;
497 	struct perf_session *session;
498 	bool disabled = false, draining = false;
499 	int fd;
500 
501 	rec->progname = argv[0];
502 
503 	atexit(record__sig_exit);
504 	signal(SIGCHLD, sig_handler);
505 	signal(SIGINT, sig_handler);
506 	signal(SIGTERM, sig_handler);
507 	if (rec->opts.auxtrace_snapshot_mode)
508 		signal(SIGUSR2, snapshot_sig_handler);
509 	else
510 		signal(SIGUSR2, SIG_IGN);
511 
512 	session = perf_session__new(file, false, tool);
513 	if (session == NULL) {
514 		pr_err("Perf session creation failed.\n");
515 		return -1;
516 	}
517 
518 	fd = perf_data_file__fd(file);
519 	rec->session = session;
520 
521 	record__init_features(rec);
522 
523 	if (forks) {
524 		err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
525 						    argv, file->is_pipe,
526 						    workload_exec_failed_signal);
527 		if (err < 0) {
528 			pr_err("Couldn't run the workload!\n");
529 			status = err;
530 			goto out_delete_session;
531 		}
532 	}
533 
534 	if (record__open(rec) != 0) {
535 		err = -1;
536 		goto out_child;
537 	}
538 
539 	/*
540 	 * Normally perf_session__new would do this, but it doesn't have the
541 	 * evlist.
542 	 */
543 	if (rec->tool.ordered_events && !perf_evlist__sample_id_all(rec->evlist)) {
544 		pr_warning("WARNING: No sample_id_all support, falling back to unordered processing\n");
545 		rec->tool.ordered_events = false;
546 	}
547 
548 	if (!rec->evlist->nr_groups)
549 		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
550 
551 	if (file->is_pipe) {
552 		err = perf_header__write_pipe(fd);
553 		if (err < 0)
554 			goto out_child;
555 	} else {
556 		err = perf_session__write_header(session, rec->evlist, fd, false);
557 		if (err < 0)
558 			goto out_child;
559 	}
560 
561 	if (!rec->no_buildid
562 	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
563 		pr_err("Couldn't generate buildids. "
564 		       "Use --no-buildid to profile anyway.\n");
565 		err = -1;
566 		goto out_child;
567 	}
568 
569 	machine = &session->machines.host;
570 
571 	if (file->is_pipe) {
572 		err = perf_event__synthesize_attrs(tool, session,
573 						   process_synthesized_event);
574 		if (err < 0) {
575 			pr_err("Couldn't synthesize attrs.\n");
576 			goto out_child;
577 		}
578 
579 		if (have_tracepoints(&rec->evlist->entries)) {
580 			/*
581 			 * FIXME err <= 0 here actually means that
582 			 * there were no tracepoints so its not really
583 			 * an error, just that we don't need to
584 			 * synthesize anything.  We really have to
585 			 * return this more properly and also
586 			 * propagate errors that now are calling die()
587 			 */
588 			err = perf_event__synthesize_tracing_data(tool,	fd, rec->evlist,
589 								  process_synthesized_event);
590 			if (err <= 0) {
591 				pr_err("Couldn't record tracing data.\n");
592 				goto out_child;
593 			}
594 			rec->bytes_written += err;
595 		}
596 	}
597 
598 	if (rec->opts.full_auxtrace) {
599 		err = perf_event__synthesize_auxtrace_info(rec->itr, tool,
600 					session, process_synthesized_event);
601 		if (err)
602 			goto out_delete_session;
603 	}
604 
605 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
606 						 machine);
607 	if (err < 0)
608 		pr_err("Couldn't record kernel reference relocation symbol\n"
609 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
610 		       "Check /proc/kallsyms permission or run as root.\n");
611 
612 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
613 					     machine);
614 	if (err < 0)
615 		pr_err("Couldn't record kernel module information.\n"
616 		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
617 		       "Check /proc/modules permission or run as root.\n");
618 
619 	if (perf_guest) {
620 		machines__process_guests(&session->machines,
621 					 perf_event__synthesize_guest_os, tool);
622 	}
623 
624 	err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
625 					    process_synthesized_event, opts->sample_address,
626 					    opts->proc_map_timeout);
627 	if (err != 0)
628 		goto out_child;
629 
630 	if (rec->realtime_prio) {
631 		struct sched_param param;
632 
633 		param.sched_priority = rec->realtime_prio;
634 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
635 			pr_err("Could not set realtime priority.\n");
636 			err = -1;
637 			goto out_child;
638 		}
639 	}
640 
641 	/*
642 	 * When perf is starting the traced process, all the events
643 	 * (apart from group members) have enable_on_exec=1 set,
644 	 * so don't spoil it by prematurely enabling them.
645 	 */
646 	if (!target__none(&opts->target) && !opts->initial_delay)
647 		perf_evlist__enable(rec->evlist);
648 
649 	/*
650 	 * Let the child rip
651 	 */
652 	if (forks) {
653 		union perf_event *event;
654 
655 		event = malloc(sizeof(event->comm) + machine->id_hdr_size);
656 		if (event == NULL) {
657 			err = -ENOMEM;
658 			goto out_child;
659 		}
660 
661 		/*
662 		 * Some H/W events are generated before COMM event
663 		 * which is emitted during exec(), so perf script
664 		 * cannot see a correct process name for those events.
665 		 * Synthesize COMM event to prevent it.
666 		 */
667 		perf_event__synthesize_comm(tool, event,
668 					    rec->evlist->workload.pid,
669 					    process_synthesized_event,
670 					    machine);
671 		free(event);
672 
673 		perf_evlist__start_workload(rec->evlist);
674 	}
675 
676 	if (opts->initial_delay) {
677 		usleep(opts->initial_delay * 1000);
678 		perf_evlist__enable(rec->evlist);
679 	}
680 
681 	auxtrace_snapshot_enabled = 1;
682 	for (;;) {
683 		unsigned long long hits = rec->samples;
684 
685 		if (record__mmap_read_all(rec) < 0) {
686 			auxtrace_snapshot_enabled = 0;
687 			err = -1;
688 			goto out_child;
689 		}
690 
691 		if (auxtrace_record__snapshot_started) {
692 			auxtrace_record__snapshot_started = 0;
693 			if (!auxtrace_snapshot_err)
694 				record__read_auxtrace_snapshot(rec);
695 			if (auxtrace_snapshot_err) {
696 				pr_err("AUX area tracing snapshot failed\n");
697 				err = -1;
698 				goto out_child;
699 			}
700 		}
701 
702 		if (hits == rec->samples) {
703 			if (done || draining)
704 				break;
705 			err = perf_evlist__poll(rec->evlist, -1);
706 			/*
707 			 * Propagate error, only if there's any. Ignore positive
708 			 * number of returned events and interrupt error.
709 			 */
710 			if (err > 0 || (err < 0 && errno == EINTR))
711 				err = 0;
712 			waking++;
713 
714 			if (perf_evlist__filter_pollfd(rec->evlist, POLLERR | POLLHUP) == 0)
715 				draining = true;
716 		}
717 
718 		/*
719 		 * When perf is starting the traced process, at the end events
720 		 * die with the process and we wait for that. Thus no need to
721 		 * disable events in this case.
722 		 */
723 		if (done && !disabled && !target__none(&opts->target)) {
724 			auxtrace_snapshot_enabled = 0;
725 			perf_evlist__disable(rec->evlist);
726 			disabled = true;
727 		}
728 	}
729 	auxtrace_snapshot_enabled = 0;
730 
731 	if (forks && workload_exec_errno) {
732 		char msg[STRERR_BUFSIZE];
733 		const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
734 		pr_err("Workload failed: %s\n", emsg);
735 		err = -1;
736 		goto out_child;
737 	}
738 
739 	if (!quiet)
740 		fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
741 
742 out_child:
743 	if (forks) {
744 		int exit_status;
745 
746 		if (!child_finished)
747 			kill(rec->evlist->workload.pid, SIGTERM);
748 
749 		wait(&exit_status);
750 
751 		if (err < 0)
752 			status = err;
753 		else if (WIFEXITED(exit_status))
754 			status = WEXITSTATUS(exit_status);
755 		else if (WIFSIGNALED(exit_status))
756 			signr = WTERMSIG(exit_status);
757 	} else
758 		status = err;
759 
760 	/* this will be recalculated during process_buildids() */
761 	rec->samples = 0;
762 
763 	if (!err && !file->is_pipe) {
764 		rec->session->header.data_size += rec->bytes_written;
765 		file->size = lseek(perf_data_file__fd(file), 0, SEEK_CUR);
766 
767 		if (!rec->no_buildid) {
768 			process_buildids(rec);
769 
770 			if (rec->buildid_all)
771 				dsos__hit_all(rec->session);
772 		}
773 		perf_session__write_header(rec->session, rec->evlist, fd, true);
774 	}
775 
776 	if (!err && !quiet) {
777 		char samples[128];
778 
779 		if (rec->samples && !rec->opts.full_auxtrace)
780 			scnprintf(samples, sizeof(samples),
781 				  " (%" PRIu64 " samples)", rec->samples);
782 		else
783 			samples[0] = '\0';
784 
785 		fprintf(stderr,	"[ perf record: Captured and wrote %.3f MB %s%s ]\n",
786 			perf_data_file__size(file) / 1024.0 / 1024.0,
787 			file->path, samples);
788 	}
789 
790 out_delete_session:
791 	perf_session__delete(session);
792 	return status;
793 }
794 
795 static void callchain_debug(void)
796 {
797 	static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF", "LBR" };
798 
799 	pr_debug("callchain: type %s\n", str[callchain_param.record_mode]);
800 
801 	if (callchain_param.record_mode == CALLCHAIN_DWARF)
802 		pr_debug("callchain: stack dump size %d\n",
803 			 callchain_param.dump_size);
804 }
805 
806 int record_parse_callchain_opt(const struct option *opt,
807 			       const char *arg,
808 			       int unset)
809 {
810 	int ret;
811 	struct record_opts *record = (struct record_opts *)opt->value;
812 
813 	record->callgraph_set = true;
814 	callchain_param.enabled = !unset;
815 
816 	/* --no-call-graph */
817 	if (unset) {
818 		callchain_param.record_mode = CALLCHAIN_NONE;
819 		pr_debug("callchain: disabled\n");
820 		return 0;
821 	}
822 
823 	ret = parse_callchain_record_opt(arg, &callchain_param);
824 	if (!ret) {
825 		/* Enable data address sampling for DWARF unwind. */
826 		if (callchain_param.record_mode == CALLCHAIN_DWARF)
827 			record->sample_address = true;
828 		callchain_debug();
829 	}
830 
831 	return ret;
832 }
833 
834 int record_callchain_opt(const struct option *opt,
835 			 const char *arg __maybe_unused,
836 			 int unset __maybe_unused)
837 {
838 	struct record_opts *record = (struct record_opts *)opt->value;
839 
840 	record->callgraph_set = true;
841 	callchain_param.enabled = true;
842 
843 	if (callchain_param.record_mode == CALLCHAIN_NONE)
844 		callchain_param.record_mode = CALLCHAIN_FP;
845 
846 	callchain_debug();
847 	return 0;
848 }
849 
850 static int perf_record_config(const char *var, const char *value, void *cb)
851 {
852 	struct record *rec = cb;
853 
854 	if (!strcmp(var, "record.build-id")) {
855 		if (!strcmp(value, "cache"))
856 			rec->no_buildid_cache = false;
857 		else if (!strcmp(value, "no-cache"))
858 			rec->no_buildid_cache = true;
859 		else if (!strcmp(value, "skip"))
860 			rec->no_buildid = true;
861 		else
862 			return -1;
863 		return 0;
864 	}
865 	if (!strcmp(var, "record.call-graph"))
866 		var = "call-graph.record-mode"; /* fall-through */
867 
868 	return perf_default_config(var, value, cb);
869 }
870 
/* One accepted clock name and the clock id it stands for. */
struct clockid_map {
	const char *name;
	int clockid;
};

/* Table entry for clock name n with id c. */
#define CLOCKID_MAP(n, c) { .name = n, .clockid = (c), }

/* Table terminator: an entry without a name. */
#define CLOCKID_END { .name = NULL, }

/*
 * Fallback values for clock ids that the system headers may not
 * provide; we need to build on many distros...
 */
#ifndef CLOCK_MONOTONIC_RAW
#define CLOCK_MONOTONIC_RAW 4
#endif

#ifndef CLOCK_BOOTTIME
#define CLOCK_BOOTTIME 7
#endif

#ifndef CLOCK_TAI
#define CLOCK_TAI 11
#endif

/* Names understood by parse_clockid(), ended by CLOCKID_END. */
static const struct clockid_map clockids[] = {
	/* available for all events, NMI safe */
	CLOCKID_MAP("monotonic",     CLOCK_MONOTONIC),
	CLOCKID_MAP("monotonic_raw", CLOCK_MONOTONIC_RAW),

	/* available for some events */
	CLOCKID_MAP("realtime",      CLOCK_REALTIME),
	CLOCKID_MAP("boottime",      CLOCK_BOOTTIME),
	CLOCKID_MAP("tai",           CLOCK_TAI),

	/* short aliases, available for the lazy */
	CLOCKID_MAP("mono",          CLOCK_MONOTONIC),
	CLOCKID_MAP("raw",           CLOCK_MONOTONIC_RAW),
	CLOCKID_MAP("real",          CLOCK_REALTIME),
	CLOCKID_MAP("boot",          CLOCK_BOOTTIME),

	CLOCKID_END,
};
913 
914 static int parse_clockid(const struct option *opt, const char *str, int unset)
915 {
916 	struct record_opts *opts = (struct record_opts *)opt->value;
917 	const struct clockid_map *cm;
918 	const char *ostr = str;
919 
920 	if (unset) {
921 		opts->use_clockid = 0;
922 		return 0;
923 	}
924 
925 	/* no arg passed */
926 	if (!str)
927 		return 0;
928 
929 	/* no setting it twice */
930 	if (opts->use_clockid)
931 		return -1;
932 
933 	opts->use_clockid = true;
934 
935 	/* if its a number, we're done */
936 	if (sscanf(str, "%d", &opts->clockid) == 1)
937 		return 0;
938 
939 	/* allow a "CLOCK_" prefix to the name */
940 	if (!strncasecmp(str, "CLOCK_", 6))
941 		str += 6;
942 
943 	for (cm = clockids; cm->name; cm++) {
944 		if (!strcasecmp(str, cm->name)) {
945 			opts->clockid = cm->clockid;
946 			return 0;
947 		}
948 	}
949 
950 	opts->use_clockid = false;
951 	ui__warning("unknown clockid %s, check man page\n", ostr);
952 	return -1;
953 }
954 
955 static int record__parse_mmap_pages(const struct option *opt,
956 				    const char *str,
957 				    int unset __maybe_unused)
958 {
959 	struct record_opts *opts = opt->value;
960 	char *s, *p;
961 	unsigned int mmap_pages;
962 	int ret;
963 
964 	if (!str)
965 		return -EINVAL;
966 
967 	s = strdup(str);
968 	if (!s)
969 		return -ENOMEM;
970 
971 	p = strchr(s, ',');
972 	if (p)
973 		*p = '\0';
974 
975 	if (*s) {
976 		ret = __perf_evlist__parse_mmap_pages(&mmap_pages, s);
977 		if (ret)
978 			goto out_free;
979 		opts->mmap_pages = mmap_pages;
980 	}
981 
982 	if (!p) {
983 		ret = 0;
984 		goto out_free;
985 	}
986 
987 	ret = __perf_evlist__parse_mmap_pages(&mmap_pages, p + 1);
988 	if (ret)
989 		goto out_free;
990 
991 	opts->auxtrace_mmap_pages = mmap_pages;
992 
993 out_free:
994 	free(s);
995 	return ret;
996 }
997 
/* Usage lines of 'perf record', NULL-terminated. */
static const char * const __record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL,
};

/* Exported pointer to the usage lines above. */
const char * const *record_usage = __record_usage;
1004 
1005 /*
1006  * XXX Ideally would be local to cmd_record() and passed to a record__new
1007  * because we need to have access to it in record__exit, that is called
1008  * after cmd_record() exits, but since record_options need to be accessible to
1009  * builtin-script, leave it here.
1010  *
 * At least we don't touch it in all the other functions here directly.
1012  *
1013  * Just say no to tons of global variables, sigh.
1014  */
1015 static struct record record = {
1016 	.opts = {
1017 		.sample_time	     = true,
1018 		.mmap_pages	     = UINT_MAX,
1019 		.user_freq	     = UINT_MAX,
1020 		.user_interval	     = ULLONG_MAX,
1021 		.freq		     = 4000,
1022 		.target		     = {
1023 			.uses_mmap   = true,
1024 			.default_per_cpu = true,
1025 		},
1026 		.proc_map_timeout     = 500,
1027 	},
1028 	.tool = {
1029 		.sample		= process_sample_event,
1030 		.fork		= perf_event__process_fork,
1031 		.exit		= perf_event__process_exit,
1032 		.comm		= perf_event__process_comm,
1033 		.mmap		= perf_event__process_mmap,
1034 		.mmap2		= perf_event__process_mmap2,
1035 		.ordered_events	= true,
1036 	},
1037 };
1038 
1039 const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
1040 	"\n\t\t\t\tDefault: fp";
1041 
1042 /*
1043  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
1044  * with it and switch to use the library functions in perf_evlist that came
1045  * from builtin-record.c, i.e. use record_opts,
1046  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
1047  * using pipes, etc.
1048  */
1049 struct option __record_options[] = {
1050 	OPT_CALLBACK('e', "event", &record.evlist, "event",
1051 		     "event selector. use 'perf list' to list available events",
1052 		     parse_events_option),
1053 	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
1054 		     "event filter", parse_filter),
1055 	OPT_CALLBACK_NOOPT(0, "exclude-perf", &record.evlist,
1056 			   NULL, "don't record events from perf itself",
1057 			   exclude_perf),
1058 	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
1059 		    "record events on existing process id"),
1060 	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
1061 		    "record events on existing thread id"),
1062 	OPT_INTEGER('r', "realtime", &record.realtime_prio,
1063 		    "collect data with this RT SCHED_FIFO priority"),
1064 	OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
1065 		    "collect data without buffering"),
1066 	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
1067 		    "collect raw sample records from all opened counters"),
1068 	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
1069 			    "system-wide collection from all CPUs"),
1070 	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
1071 		    "list of cpus to monitor"),
1072 	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
1073 	OPT_STRING('o', "output", &record.file.path, "file",
1074 		    "output file name"),
1075 	OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
1076 			&record.opts.no_inherit_set,
1077 			"child tasks do not inherit counters"),
1078 	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
1079 	OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
1080 		     "number of mmap data pages and AUX area tracing mmap pages",
1081 		     record__parse_mmap_pages),
1082 	OPT_BOOLEAN(0, "group", &record.opts.group,
1083 		    "put the counters into a counter group"),
1084 	OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
1085 			   NULL, "enables call-graph recording" ,
1086 			   &record_callchain_opt),
1087 	OPT_CALLBACK(0, "call-graph", &record.opts,
1088 		     "record_mode[,record_size]", record_callchain_help,
1089 		     &record_parse_callchain_opt),
1090 	OPT_INCR('v', "verbose", &verbose,
1091 		    "be more verbose (show counter open errors, etc)"),
1092 	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
1093 	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
1094 		    "per thread counts"),
1095 	OPT_BOOLEAN('d', "data", &record.opts.sample_address, "Record the sample addresses"),
1096 	OPT_BOOLEAN_SET('T', "timestamp", &record.opts.sample_time,
1097 			&record.opts.sample_time_set,
1098 			"Record the sample timestamps"),
1099 	OPT_BOOLEAN('P', "period", &record.opts.period, "Record the sample period"),
1100 	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
1101 		    "don't sample"),
1102 	OPT_BOOLEAN_SET('N', "no-buildid-cache", &record.no_buildid_cache,
1103 			&record.no_buildid_cache_set,
1104 			"do not update the buildid cache"),
1105 	OPT_BOOLEAN_SET('B', "no-buildid", &record.no_buildid,
1106 			&record.no_buildid_set,
1107 			"do not collect buildids in perf.data"),
1108 	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
1109 		     "monitor event in cgroup name only",
1110 		     parse_cgroups),
1111 	OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
1112 		  "ms to wait before starting measurement after program start"),
1113 	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
1114 		   "user to profile"),
1115 
1116 	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
1117 		     "branch any", "sample any taken branches",
1118 		     parse_branch_stack),
1119 
1120 	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
1121 		     "branch filter mask", "branch stack filter modes",
1122 		     parse_branch_stack),
1123 	OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
1124 		    "sample by weight (on special events only)"),
1125 	OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
1126 		    "sample transaction flags (special events only)"),
1127 	OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
1128 		    "use per-thread mmaps"),
1129 	OPT_CALLBACK_OPTARG('I', "intr-regs", &record.opts.sample_intr_regs, NULL, "any register",
1130 		    "sample selected machine registers on interrupt,"
1131 		    " use -I ? to list register names", parse_regs),
1132 	OPT_BOOLEAN(0, "running-time", &record.opts.running_time,
1133 		    "Record running/enabled time of read (:S) events"),
1134 	OPT_CALLBACK('k', "clockid", &record.opts,
1135 	"clockid", "clockid to use for events, see clock_gettime()",
1136 	parse_clockid),
1137 	OPT_STRING_OPTARG('S', "snapshot", &record.opts.auxtrace_snapshot_opts,
1138 			  "opts", "AUX area tracing Snapshot Mode", ""),
1139 	OPT_UINTEGER(0, "proc-map-timeout", &record.opts.proc_map_timeout,
1140 			"per thread proc mmap processing timeout in ms"),
1141 	OPT_BOOLEAN(0, "switch-events", &record.opts.record_switch_events,
1142 		    "Record context switch events"),
1143 	OPT_STRING(0, "clang-path", &llvm_param.clang_path, "clang path",
1144 		   "clang binary to use for compiling BPF scriptlets"),
1145 	OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
1146 		   "options passed to clang when compiling BPF scriptlets"),
1147 	OPT_STRING(0, "vmlinux", &symbol_conf.vmlinux_name,
1148 		   "file", "vmlinux pathname"),
1149 	OPT_BOOLEAN(0, "buildid-all", &record.buildid_all,
1150 		    "Record build-id of all DSOs regardless of hits"),
1151 	OPT_END()
1152 };
1153 
/*
 * Non-static pointer to the __record_options table that ends just above.
 * cmd_record() hands this pointer to parse_options() and, when some build
 * features are missing, to set_option_nobuild().
 */
1154 struct option *record_options = __record_options;
1155 
1156 int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
1157 {
1158 	int err;
1159 	struct record *rec = &record;
1160 	char errbuf[BUFSIZ];
1161 
1162 #ifndef HAVE_LIBBPF_SUPPORT
1163 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, "NO_LIBBPF=1", c)
1164 	set_nobuild('\0', "clang-path", true);
1165 	set_nobuild('\0', "clang-opt", true);
1166 # undef set_nobuild
1167 #endif
1168 
1169 #ifndef HAVE_BPF_PROLOGUE
1170 # if !defined (HAVE_DWARF_SUPPORT)
1171 #  define REASON  "NO_DWARF=1"
1172 # elif !defined (HAVE_LIBBPF_SUPPORT)
1173 #  define REASON  "NO_LIBBPF=1"
1174 # else
1175 #  define REASON  "this architecture doesn't support BPF prologue"
1176 # endif
1177 # define set_nobuild(s, l, c) set_option_nobuild(record_options, s, l, REASON, c)
1178 	set_nobuild('\0', "vmlinux", true);
1179 # undef set_nobuild
1180 # undef REASON
1181 #endif
1182 
1183 	rec->evlist = perf_evlist__new();
1184 	if (rec->evlist == NULL)
1185 		return -ENOMEM;
1186 
1187 	perf_config(perf_record_config, rec);
1188 
1189 	argc = parse_options(argc, argv, record_options, record_usage,
1190 			    PARSE_OPT_STOP_AT_NON_OPTION);
1191 	if (!argc && target__none(&rec->opts.target))
1192 		usage_with_options(record_usage, record_options);
1193 
1194 	if (nr_cgroups && !rec->opts.target.system_wide) {
1195 		usage_with_options_msg(record_usage, record_options,
1196 			"cgroup monitoring only available in system-wide mode");
1197 
1198 	}
1199 	if (rec->opts.record_switch_events &&
1200 	    !perf_can_record_switch_events()) {
1201 		ui__error("kernel does not support recording context switch events\n");
1202 		parse_options_usage(record_usage, record_options, "switch-events", 0);
1203 		return -EINVAL;
1204 	}
1205 
1206 	if (!rec->itr) {
1207 		rec->itr = auxtrace_record__init(rec->evlist, &err);
1208 		if (err)
1209 			return err;
1210 	}
1211 
1212 	err = auxtrace_parse_snapshot_options(rec->itr, &rec->opts,
1213 					      rec->opts.auxtrace_snapshot_opts);
1214 	if (err)
1215 		return err;
1216 
1217 	err = -ENOMEM;
1218 
1219 	symbol__init(NULL);
1220 
1221 	if (symbol_conf.kptr_restrict)
1222 		pr_warning(
1223 "WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
1224 "check /proc/sys/kernel/kptr_restrict.\n\n"
1225 "Samples in kernel functions may not be resolved if a suitable vmlinux\n"
1226 "file is not found in the buildid cache or in the vmlinux path.\n\n"
1227 "Samples in kernel modules won't be resolved at all.\n\n"
1228 "If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
1229 "even with a suitable vmlinux or kallsyms file.\n\n");
1230 
1231 	if (rec->no_buildid_cache || rec->no_buildid)
1232 		disable_buildid_cache();
1233 
1234 	if (rec->evlist->nr_entries == 0 &&
1235 	    perf_evlist__add_default(rec->evlist) < 0) {
1236 		pr_err("Not enough memory for event selector list\n");
1237 		goto out_symbol_exit;
1238 	}
1239 
1240 	if (rec->opts.target.tid && !rec->opts.no_inherit_set)
1241 		rec->opts.no_inherit = true;
1242 
1243 	err = target__validate(&rec->opts.target);
1244 	if (err) {
1245 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1246 		ui__warning("%s", errbuf);
1247 	}
1248 
1249 	err = target__parse_uid(&rec->opts.target);
1250 	if (err) {
1251 		int saved_errno = errno;
1252 
1253 		target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
1254 		ui__error("%s", errbuf);
1255 
1256 		err = -saved_errno;
1257 		goto out_symbol_exit;
1258 	}
1259 
1260 	err = -ENOMEM;
1261 	if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
1262 		usage_with_options(record_usage, record_options);
1263 
1264 	err = auxtrace_record__options(rec->itr, rec->evlist, &rec->opts);
1265 	if (err)
1266 		goto out_symbol_exit;
1267 
1268 	/*
1269 	 * We take all buildids when the file contains
1270 	 * AUX area tracing data because we do not decode the
1271 	 * trace because it would take too long.
1272 	 */
1273 	if (rec->opts.full_auxtrace)
1274 		rec->buildid_all = true;
1275 
1276 	if (record_opts__config(&rec->opts)) {
1277 		err = -EINVAL;
1278 		goto out_symbol_exit;
1279 	}
1280 
1281 	err = __cmd_record(&record, argc, argv);
1282 out_symbol_exit:
1283 	perf_evlist__delete(rec->evlist);
1284 	symbol__exit();
1285 	auxtrace_record__free(rec->itr);
1286 	return err;
1287 }
1288 
1289 static void snapshot_sig_handler(int sig __maybe_unused)
1290 {
1291 	if (!auxtrace_snapshot_enabled)
1292 		return;
1293 	auxtrace_snapshot_enabled = 0;
1294 	auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
1295 	auxtrace_record__snapshot_started = 1;
1296 }
1297