xref: /linux/tools/perf/builtin-record.c (revision 4413e16d9d21673bb5048a2e542f1aaa00015c2e)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9 
10 #include "builtin.h"
11 
12 #include "perf.h"
13 
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18 
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mman.h>
33 
/* Common prefix of the -g/--call-graph option help text. */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef NO_LIBUNWIND_SUPPORT
static char callchain_help[] = CALLCHAIN_HELP "[fp]";
#else
/* Default user-stack dump size (bytes) for "-g dwarf" without an explicit size. */
static unsigned long default_stack_dump_size = 8192;
static char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#endif
42 
/*
 * What to do when the output file already exists: overwrite it
 * (WRITE_FORCE) or append new samples to it (WRITE_APPEND, -A).
 */
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

/* All per-invocation state of 'perf record'. */
struct perf_record {
	struct perf_tool	tool;		/* event delivery callbacks */
	struct perf_record_opts	opts;		/* user-selected recording options */
	u64			bytes_written;	/* event payload bytes written so far */
	const char		*output_name;	/* -o name; NULL selects the default */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;
	const char		*progname;	/* argv[0], used by psignal() reporting */
	int			output;		/* output file descriptor */
	unsigned int		page_size;	/* runtime page size (mmap control page) */
	int			realtime_prio;	/* non-zero: switch to SCHED_FIFO at this prio */
	enum write_mode_t	write_mode;
	bool			no_buildid;	/* -B: skip build-id collection */
	bool			no_buildid_cache; /* -N: don't update the buildid cache */
	bool			force;		/* -f: overwrite existing file (deprecated) */
	bool			file_new;	/* true unless appending to an existing file */
	bool			append_file;	/* -A given on the command line */
	long			samples;	/* number of mmap reads that found data */
	off_t			post_processing_offset; /* file offset where event data starts */
};
68 
69 static void advance_output(struct perf_record *rec, size_t size)
70 {
71 	rec->bytes_written += size;
72 }
73 
74 static int write_output(struct perf_record *rec, void *buf, size_t size)
75 {
76 	while (size) {
77 		int ret = write(rec->output, buf, size);
78 
79 		if (ret < 0) {
80 			pr_err("failed to write\n");
81 			return -1;
82 		}
83 
84 		size -= ret;
85 		buf += ret;
86 
87 		rec->bytes_written += ret;
88 	}
89 
90 	return 0;
91 }
92 
93 static int process_synthesized_event(struct perf_tool *tool,
94 				     union perf_event *event,
95 				     struct perf_sample *sample __maybe_unused,
96 				     struct machine *machine __maybe_unused)
97 {
98 	struct perf_record *rec = container_of(tool, struct perf_record, tool);
99 	if (write_output(rec, event, event->header.size) < 0)
100 		return -1;
101 
102 	return 0;
103 }
104 
/*
 * Drain one mmap'ed ring buffer into the output file.
 *
 * Copies the bytes between the kernel's head pointer and our cached
 * tail (md->prev); when that span wraps past the end of the buffer it
 * is written in two chunks.  Afterwards the tail is advanced so the
 * kernel can reuse the space.
 *
 * Returns 0 on success (including "nothing to read"), -1 on write error.
 */
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;	/* skip the control page */
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;	/* ring buffer is empty */

	rec->samples++;

	size = head - old;

	/* Wrapped?  First write the chunk up to the end of the buffer. */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	/* Remaining (or only) chunk, from 'old' up to 'head'. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);	/* tell the kernel we consumed the data */

out:
	return rc;
}
148 
static volatile int done = 0;		/* set by sig_handler(): stop the record loop */
static volatile int signr = -1;		/* signal that asked us to stop, if any */
static volatile int child_finished = 0;	/* SIGCHLD seen: workload already exited */
152 
153 static void sig_handler(int sig)
154 {
155 	if (sig == SIGCHLD)
156 		child_finished = 1;
157 
158 	done = 1;
159 	signr = sig;
160 }
161 
/*
 * on_exit() handler: reap (or, if still running, terminate) the forked
 * workload and, when we are exiting because of a fatal signal, re-raise
 * it with the default disposition so the parent shell sees the real
 * exit reason.
 */
static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	/* SIGUSR1 is used for a graceful stop; don't propagate it. */
	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}
182 
183 static bool perf_evlist__equal(struct perf_evlist *evlist,
184 			       struct perf_evlist *other)
185 {
186 	struct perf_evsel *pos, *pair;
187 
188 	if (evlist->nr_entries != other->nr_entries)
189 		return false;
190 
191 	pair = perf_evlist__first(other);
192 
193 	list_for_each_entry(pos, &evlist->entries, node) {
194 		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
195 			return false;
196 		pair = perf_evsel__next(pair);
197 	}
198 
199 	return true;
200 }
201 
/*
 * Configure and open every counter in the evlist, applying fallbacks
 * for older kernels (no exclude_guest/exclude_host, no sample_id_all,
 * no hardware cycles event), then apply event filters and mmap the
 * ring buffers.  When appending, verify the evlist matches the one
 * already stored in the file.
 *
 * Returns 0 on success, negative on error.
 */
static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config_attrs(evlist, opts);

	if (opts->group)
		perf_evlist__set_leader(evlist);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err ==  ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				/*
				 * EINVAL from an older kernel: drop the newer
				 * attr bits one group at a time and retry.
				 */
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages)) {
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		/* Appending: the events must match what is already in the file. */
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
362 
363 static int process_buildids(struct perf_record *rec)
364 {
365 	u64 size = lseek(rec->output, 0, SEEK_CUR);
366 
367 	if (size == 0)
368 		return 0;
369 
370 	rec->session->fd = rec->output;
371 	return __perf_session__process_events(rec->session, rec->post_processing_offset,
372 					      size - rec->post_processing_offset,
373 					      size, &build_id__mark_dso_hit_ops);
374 }
375 
/*
 * on_exit() handler for a successful run: fold the written byte count
 * into the header's data size, process build-ids, rewrite the header
 * in place, then tear down the session.  Skipped entirely when piping
 * (there is no seekable header to patch) or when exiting on error.
 */
static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;		/* exiting on error: leave the file alone */

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}
395 
/*
 * perf_session__process_machines() callback: synthesize module and
 * kernel mmap events for each guest machine.  The host machine is
 * handled directly in __cmd_record() and skipped here.
 */
static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;

	if (machine__is_host(machine))
		return;

	/*
	 *As for guest kernel when processing subcommand record&report,
	 *we arrange module mmap prior to guest kernel mmap and trigger
	 *a preload dso because default guest module symbols are loaded
	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
	 *method is used to avoid symbol missing when the first addr is
	 *in module instead of in guest kernel.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
	 * have no _text sometimes.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}
431 
/*
 * Synthetic PERF_RECORD_FINISHED_ROUND marker, written after each full
 * pass over the mmap buffers so report-time reordering has flush points.
 */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
436 
437 static int perf_record__mmap_read_all(struct perf_record *rec)
438 {
439 	int i;
440 	int rc = 0;
441 
442 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
443 		if (rec->evlist->mmap[i].base) {
444 			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
445 				rc = -1;
446 				goto out;
447 			}
448 		}
449 	}
450 
451 	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
452 		rc = write_output(rec, &finished_round_event,
453 				  sizeof(finished_round_event));
454 
455 out:
456 	return rc;
457 }
458 
/*
 * The record main loop: pick and open the output file, create the
 * session, open the counters, synthesize the initial metadata events
 * (attrs, kernel/module maps, existing threads), start the workload if
 * one was given, then drain the mmap buffers until told to stop.
 *
 * Returns 0 on success, negative on error.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	/*
	 * Pick the output: explicit -o name, stdout when it is a pipe,
	 * otherwise the default "perf.data".
	 */
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			/* Overwriting: keep the previous data as <name>.old. */
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			/* Nothing to append to: fall back to overwriting. */
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	/* Start with every header feature set, then clear the inapplicable ones. */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	/*
	 * When piping, the reader never sees a file header, so the
	 * attr/event-type/tracing-data metadata must be synthesized as
	 * in-stream events instead.
	 */
	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		/* No new data this pass: sleep until the counters wake us. */
		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/* Told to stop: disable counting but keep draining buffers. */
		if (done)
			perf_evlist__disable(evsel_list);
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}
718 
/* Build one entry of the branch-filter name -> sample flag table. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

/* Maps a -b/-j branch filter token to its PERF_SAMPLE_BRANCH_* flag. */
struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};
739 
/*
 * Parse the -b/-j branch stack options: a comma separated list of
 * branch_modes[] names, OR-ed into *opt->value.  When no privilege
 * level (u/k/hv) is requested, default to PERF_SAMPLE_BRANCH_ANY.
 *
 * Returns 0 on success, -1 on an unknown filter name or when the mask
 * was already set (e.g. -b combined with --branch-filter).
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			/* split on ',' in place */
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}
802 
803 #ifndef NO_LIBUNWIND_SUPPORT
804 static int get_stack_size(char *str, unsigned long *_size)
805 {
806 	char *endptr;
807 	unsigned long size;
808 	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
809 
810 	size = strtoul(str, &endptr, 0);
811 
812 	do {
813 		if (*endptr)
814 			break;
815 
816 		size = round_up(size, sizeof(u64));
817 		if (!size || size > max_size)
818 			break;
819 
820 		*_size = size;
821 		return 0;
822 
823 	} while (0);
824 
825 	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
826 	       max_size, str);
827 	return -1;
828 }
829 #endif /* !NO_LIBUNWIND_SUPPORT */
830 
831 static int
832 parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
833 		    int unset)
834 {
835 	struct perf_record *rec = (struct perf_record *)opt->value;
836 	char *tok, *name, *saveptr = NULL;
837 	char *buf;
838 	int ret = -1;
839 
840 	/* --no-call-graph */
841 	if (unset)
842 		return 0;
843 
844 	/* We specified default option if none is provided. */
845 	BUG_ON(!arg);
846 
847 	/* We need buffer that we know we can write to. */
848 	buf = malloc(strlen(arg) + 1);
849 	if (!buf)
850 		return -ENOMEM;
851 
852 	strcpy(buf, arg);
853 
854 	tok = strtok_r((char *)buf, ",", &saveptr);
855 	name = tok ? : (char *)buf;
856 
857 	do {
858 		/* Framepointer style */
859 		if (!strncmp(name, "fp", sizeof("fp"))) {
860 			if (!strtok_r(NULL, ",", &saveptr)) {
861 				rec->opts.call_graph = CALLCHAIN_FP;
862 				ret = 0;
863 			} else
864 				pr_err("callchain: No more arguments "
865 				       "needed for -g fp\n");
866 			break;
867 
868 #ifndef NO_LIBUNWIND_SUPPORT
869 		/* Dwarf style */
870 		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
871 			ret = 0;
872 			rec->opts.call_graph = CALLCHAIN_DWARF;
873 			rec->opts.stack_dump_size = default_stack_dump_size;
874 
875 			tok = strtok_r(NULL, ",", &saveptr);
876 			if (tok) {
877 				unsigned long size = 0;
878 
879 				ret = get_stack_size(tok, &size);
880 				rec->opts.stack_dump_size = size;
881 			}
882 
883 			if (!ret)
884 				pr_debug("callchain: stack dump size %d\n",
885 					 rec->opts.stack_dump_size);
886 #endif /* !NO_LIBUNWIND_SUPPORT */
887 		} else {
888 			pr_err("callchain: Unknown -g option "
889 			       "value: %s\n", arg);
890 			break;
891 		}
892 
893 	} while (0);
894 
895 	free(buf);
896 
897 	if (!ret)
898 		pr_debug("callchain: type %d\n", rec->opts.call_graph);
899 
900 	return ret;
901 }
902 
/* Usage strings shown by parse_options()/usage_with_options(). */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
908 
909 /*
910  * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
911  * because we need to have access to it in perf_record__exit, that is called
912  * after cmd_record() exits, but since record_options need to be accessible to
913  * builtin-script, leave it here.
914  *
915  * At least we don't ouch it in all the other functions here directly.
916  *
917  * Just say no to tons of global variables, sigh.
918  */
static struct perf_record record = {
	.opts = {
		/* UINT_MAX/ULLONG_MAX mean "not set by the user". */
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,	/* default sampling frequency (Hz) */
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};
932 
933 /*
934  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
935  * with it and switch to use the library functions in perf_evlist that came
936  * from builtin-record.c, i.e. use perf_record_opts,
937  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
938  * using pipes, etc.
939  */
/* Command line options of 'perf record'; also referenced by builtin-script.c. */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};
1007 
/*
 * 'perf record' entry point: parse and validate options and targets,
 * fall back to the default event and frequency/period settings where
 * needed, then hand off to __cmd_record().
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* Need either a workload to run or an existing target to attach to. */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* No -e given: fall back to the default event. */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}
1115