xref: /linux/tools/perf/builtin-record.c (revision 6e6d9ba0d1ea224a877826fc1cc0f42878b60384)
/*
 * builtin-record.c
 *
 * Builtin record command: Record the profile of a workload
 * (or a CPU, or a PID) into the perf.data output file - for
 * later analysis via perf report.
 */
#define _FILE_OFFSET_BITS 64

#include "builtin.h"

#include "perf.h"

#include "util/build-id.h"
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"

#include "util/header.h"
#include "util/event.h"
#include "util/evlist.h"
#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
#include "util/thread_map.h"

#include <unistd.h>
#include <sched.h>
#include <sys/mman.h>

#ifndef HAVE_ON_EXIT
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
#define exit(x) (exit)(__exitcode = (x))

static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

static void __handle_on_exit_funcs(void)
{
	int i;
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif
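
/*
 * Illustrative use of the fallback above (the handler name is made up):
 *
 *	static void cleanup(int status, void *arg);
 *	...
 *	on_exit(cleanup, rec);
 *
 * cleanup() is then invoked as cleanup(__exitcode, rec) when exit() runs,
 * since the exit() wrapper records the status in __exitcode first.
 */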

enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};

struct perf_record {
	struct perf_tool	tool;
	struct perf_record_opts	opts;
	u64			bytes_written;
	const char		*output_name;
	struct perf_evlist	*evlist;
	struct perf_session	*session;
	const char		*progname;
	int			output;
	unsigned int		page_size;
	int			realtime_prio;
	enum write_mode_t	write_mode;
	bool			no_buildid;
	bool			no_buildid_cache;
	bool			force;
	bool			file_new;
	bool			append_file;
	long			samples;
	off_t			post_processing_offset;
};

static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}

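/*
 * write(2) may legitimately write fewer bytes than requested, so keep
 * writing until the whole buffer has hit the output file or a hard
 * error comes back.
 */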
static int write_output(struct perf_record *rec, void *buf, size_t size)
{
	while (size) {
		int ret = write(rec->output, buf, size);

		if (ret < 0) {
			pr_err("failed to write\n");
			return -1;
		}

		size -= ret;
		buf += ret;

		rec->bytes_written += ret;
	}

	return 0;
}

static int process_synthesized_event(struct perf_tool *tool,
				     union perf_event *event,
				     struct perf_sample *sample __maybe_unused,
				     struct machine *machine __maybe_unused)
{
	struct perf_record *rec = container_of(tool, struct perf_record, tool);
	if (write_output(rec, event, event->header.size) < 0)
		return -1;

	return 0;
}

static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);
	unsigned int old = md->prev;
	unsigned char *data = md->base + rec->page_size;
	unsigned long size;
	void *buf;
	int rc = 0;

	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

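	/*
	 * The ring buffer is mask + 1 bytes long and wraps: a span that
	 * crosses the end of the buffer must be flushed as two chunks,
	 * first from 'old' to the end of the buffer, then from the start
	 * of the buffer up to 'head'.
	 */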
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}

static volatile int done = 0;
static volatile int signr = -1;
static volatile int child_finished = 0;

static void sig_handler(int sig)
{
	if (sig == SIGCHLD)
		child_finished = 1;

	done = 1;
	signr = sig;
}

static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
{
	struct perf_record *rec = arg;
	int status;

	if (rec->evlist->workload.pid > 0) {
		if (!child_finished)
			kill(rec->evlist->workload.pid, SIGTERM);

		wait(&status);
		if (WIFSIGNALED(status))
			psignal(WTERMSIG(status), rec->progname);
	}

	if (signr == -1 || signr == SIGUSR1)
		return;

	signal(signr, SIG_DFL);
	kill(getpid(), signr);
}

static bool perf_evlist__equal(struct perf_evlist *evlist,
			       struct perf_evlist *other)
{
	struct perf_evsel *pos, *pair;

	if (evlist->nr_entries != other->nr_entries)
		return false;

	pair = perf_evlist__first(other);

	list_for_each_entry(pos, &evlist->entries, node) {
		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr)) != 0)
			return false;
		pair = perf_evsel__next(pair);
	}

	return true;
}

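/*
 * Open a counter per event/cpu/thread, letting perf_evsel__fallback()
 * retry with a less demanding attr when the kernel rejects the first
 * attempt, then apply any event filters and mmap the ring buffers.
 */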
static int perf_record__open(struct perf_record *rec)
{
	char msg[512];
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	perf_evlist__config(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
				if (verbose)
					ui__warning("%s\n", msg);
				goto try_again;
			}

			rc = -errno;
			perf_evsel__open_strerror(pos, &opts->target,
						  errno, msg, sizeof(msg));
			ui__error("%s\n", msg);
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.\n");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}

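/*
 * Re-read everything written so far, starting at post_processing_offset,
 * so that build_id__mark_dso_hit_ops can flag the DSOs that actually got
 * samples; the header code then only emits build-ids for those.
 */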
static int process_buildids(struct perf_record *rec)
{
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}

static void perf_record__exit(int status, void *arg)
{
	struct perf_record *rec = arg;

	if (status != 0)
		return;

	if (!rec->opts.pipe_output) {
		rec->session->header.data_size += rec->bytes_written;

		if (!rec->no_buildid)
			process_buildids(rec);
		perf_session__write_header(rec->session, rec->evlist,
					   rec->output, true);
		perf_session__delete(rec->session);
		perf_evlist__delete(rec->evlist);
		symbol__exit();
	}
}

static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
{
	int err;
	struct perf_tool *tool = data;
	/*
	 * When processing the record & report subcommands for a guest
	 * kernel, we arrange the module mmaps before the guest kernel
	 * mmap and trigger a DSO preload, because by default guest
	 * module symbols are loaded from guest kallsyms instead of
	 * /lib/modules/XXX/XXX. This avoids missing symbols when the
	 * first address falls in a module rather than in the guest
	 * kernel proper.
	 */
	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);

	/*
	 * We use _stext for the guest kernel because a guest kernel's
	 * /proc/kallsyms sometimes has no _text entry.
	 */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record guest kernel [%d]'s reference"
		       " relocation symbol.\n", machine->pid);
}

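/*
 * PERF_RECORD_FINISHED_ROUND marks a point at which every mmap buffer
 * has been drained once, telling the report side it can safely sort and
 * flush all events queued before this point by timestamp.
 */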
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};

static int perf_record__mmap_read_all(struct perf_record *rec)
{
	int i;
	int rc = 0;

	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
		if (rec->evlist->mmap[i].base) {
			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
				rc = -1;
				goto out;
			}
		}
	}

	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
		rc = write_output(rec, &finished_round_event,
				  sizeof(finished_round_event));

out:
	return rc;
}

static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	if (!evsel_list->nr_groups)
		perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = &session->machines.host;

	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so it's not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate the errors that currently call die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest) {
		machines__process_guests(&session->machines,
					 perf_event__synthesize_guest_os, tool);
	}

	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

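	/*
	 * Main capture loop: drain every mmap'ed buffer, and when nothing
	 * new arrived either stop (the workload exited or we got a signal)
	 * or block in poll() until a counter wakes us up again.
	 */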
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !perf_target__none(&opts->target)) {
			perf_evlist__disable(evsel_list);
			disabled = true;
		}
	}

	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}

#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;
	int mode;
};

static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};

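/*
 * Parses a comma separated list of the branch_modes names above, e.g.
 * "any_call,u" or "any,k", into the branch_stack sample mask.
 */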
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}

#ifdef LIBUNWIND_SUPPORT
static int get_stack_size(char *str, unsigned long *_size)
{
	char *endptr;
	unsigned long size;
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));

	size = strtoul(str, &endptr, 0);

	do {
		if (*endptr)
			break;

		size = round_up(size, sizeof(u64));
		if (!size || size > max_size)
			break;

		*_size = size;
		return 0;

	} while (0);

	pr_err("callchain: Incorrect stack dump size (max %lu): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */

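/*
 * Accepts "fp" for frame pointer unwinding or, when built with
 * libunwind support, "dwarf[,<stack dump size>]", e.g. "dwarf,8192"
 * to dump 8kB of stack with every sample.
 */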
int record_parse_callchain_opt(const struct option *opt,
			       const char *arg, int unset)
{
	struct perf_record_opts *opts = opt->value;
	char *tok, *name, *saveptr = NULL;
	char *buf;
	int ret = -1;

	/* --no-call-graph */
	if (unset)
		return 0;

	/* A default option ("fp") is specified if none is provided. */
	BUG_ON(!arg);

	/* We need a buffer that we know we can write to. */
	buf = malloc(strlen(arg) + 1);
	if (!buf)
		return -ENOMEM;

	strcpy(buf, arg);

	tok = strtok_r((char *)buf, ",", &saveptr);
	name = tok ? : (char *)buf;

	do {
		/* Framepointer style */
		if (!strncmp(name, "fp", sizeof("fp"))) {
			if (!strtok_r(NULL, ",", &saveptr)) {
				opts->call_graph = CALLCHAIN_FP;
				ret = 0;
			} else
				pr_err("callchain: No more arguments "
				       "needed for -g fp\n");
			break;

#ifdef LIBUNWIND_SUPPORT
		/* Dwarf style */
		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
			const unsigned long default_stack_dump_size = 8192;

			ret = 0;
			opts->call_graph = CALLCHAIN_DWARF;
			opts->stack_dump_size = default_stack_dump_size;

			tok = strtok_r(NULL, ",", &saveptr);
			if (tok) {
				unsigned long size = 0;

				ret = get_stack_size(tok, &size);
				opts->stack_dump_size = size;
			}

			if (!ret)
				pr_debug("callchain: stack dump size %d\n",
					 opts->stack_dump_size);
#endif /* LIBUNWIND_SUPPORT */
		} else {
			pr_err("callchain: Unknown -g option "
			       "value: %s\n", arg);
			break;
		}

	} while (0);

	free(buf);

	if (!ret)
		pr_debug("callchain: type %d\n", opts->call_graph);

	return ret;
}

static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};

/*
 * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
 * because we need to have access to it in perf_record__exit, that is called
 * after cmd_record() exits, but since record_options need to be accessible to
 * builtin-script, leave it here.
 *
 * At least we don't touch it in all the other functions here directly.
 *
 * Just say no to tons of global variables, sigh.
 */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,
		.user_freq	     = UINT_MAX,
		.user_interval	     = ULLONG_MAX,
		.freq		     = 4000,
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};

#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
const char record_callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
const char record_callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif

/*
 * XXX Will stay a global variable till we fix builtin-script.c to stop messing
 * with it and switch to use the library functions in perf_evlist that came
 * from builtin-record.c, i.e. use perf_record_opts,
 * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
 * using pipes, etc.
 */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record.opts,
			     "mode[,dump_size]", record_callchain_help,
			     &record_parse_callchain_opt, "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};
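
/*
 * A couple of hypothetical invocations exercising the options above:
 *
 *	perf record -F 1000 -g fp -o perf.data -- ./myworkload
 *	perf record -a -j any_call,u sleep 10
 *
 * The first profiles a forked workload at 1000 Hz with frame-pointer
 * call graphs; the second samples user-level call branches system-wide
 * while sleep runs for ten seconds.
 */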

int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A\n");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}
1064