xref: /linux/tools/perf/builtin-record.c (revision b889fcf63cb62e7fdb7816565e28f44dbe4a76a5)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9 
10 #include "builtin.h"
11 
12 #include "perf.h"
13 
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18 
19 #include "util/header.h"
20 #include "util/event.h"
21 #include "util/evlist.h"
22 #include "util/evsel.h"
23 #include "util/debug.h"
24 #include "util/session.h"
25 #include "util/tool.h"
26 #include "util/symbol.h"
27 #include "util/cpumap.h"
28 #include "util/thread_map.h"
29 
30 #include <unistd.h>
31 #include <sched.h>
32 #include <sys/mman.h>
33 
#ifndef HAVE_ON_EXIT
/*
 * Fallback implementation of the GNU on_exit(3) extension for C libraries
 * that lack it: a fixed-size table of (callback, argument) pairs that is
 * flushed from a single atexit() hook.  The exit status is made visible to
 * the callbacks by wrapping exit() in a macro that records it first.
 */
#ifndef ATEXIT_MAX
#define ATEXIT_MAX 32
#endif
static int __on_exit_count = 0;
typedef void (*on_exit_func_t) (int, void *);
static on_exit_func_t __on_exit_funcs[ATEXIT_MAX];
static void *__on_exit_args[ATEXIT_MAX];
static int __exitcode = 0;
static void __handle_on_exit_funcs(void);
static int on_exit(on_exit_func_t function, void *arg);
/* Capture the status passed to exit() so the handlers below can see it. */
#define exit(x) (exit)(__exitcode = (x))

/*
 * Register @function to run at exit with @arg and the recorded exit code.
 * Returns 0 on success, -ENOMEM once the fixed table is full.
 */
static int on_exit(on_exit_func_t function, void *arg)
{
	if (__on_exit_count == ATEXIT_MAX)
		return -ENOMEM;
	else if (__on_exit_count == 0)
		atexit(__handle_on_exit_funcs);	/* install the flusher only once */
	__on_exit_funcs[__on_exit_count] = function;
	__on_exit_args[__on_exit_count++] = arg;
	return 0;
}

static void __handle_on_exit_funcs(void)
{
	int i;
	/*
	 * Runs handlers in registration order.  NOTE(review): glibc's
	 * on_exit runs them in reverse order — confirm the handlers
	 * registered here do not depend on ordering.
	 */
	for (i = 0; i < __on_exit_count; i++)
		__on_exit_funcs[i] (__exitcode, __on_exit_args[i]);
}
#endif
65 
/* How to treat an existing output file: clobber it, or append to it (-A). */
enum write_mode_t {
	WRITE_FORCE,
	WRITE_APPEND
};
70 
/* Per-invocation state for 'perf record', threaded through via container_of. */
struct perf_record {
	struct perf_tool	tool;		/* event-processing callbacks */
	struct perf_record_opts	opts;		/* parsed command-line options */
	u64			bytes_written;	/* payload bytes written so far */
	const char		*output_name;	/* output path, NULL -> default */
	struct perf_evlist	*evlist;	/* events being recorded */
	struct perf_session	*session;	/* perf.data session being built */
	const char		*progname;	/* argv[0], for psignal() messages */
	int			output;		/* output file descriptor */
	unsigned int		page_size;	/* runtime page size (mmap layout) */
	int			realtime_prio;	/* nonzero -> SCHED_FIFO priority */
	enum write_mode_t	write_mode;	/* overwrite vs append */
	bool			no_buildid;	/* skip buildid post-processing */
	bool			no_buildid_cache; /* don't update ~/.debug cache */
	bool			force;		/* -f: overwrite (deprecated) */
	bool			file_new;	/* output file freshly created */
	bool			append_file;	/* -A: append to existing file */
	long			samples;	/* mmap drain counter (wakeups) */
	off_t			post_processing_offset;	/* where event data starts */
};
91 
/*
 * Account for @size bytes that were written to the output by someone else
 * (e.g. synthesized tracing data written directly to the fd), so the final
 * header size stays correct.
 */
static void advance_output(struct perf_record *rec, size_t size)
{
	rec->bytes_written += size;
}
96 
97 static int write_output(struct perf_record *rec, void *buf, size_t size)
98 {
99 	while (size) {
100 		int ret = write(rec->output, buf, size);
101 
102 		if (ret < 0) {
103 			pr_err("failed to write\n");
104 			return -1;
105 		}
106 
107 		size -= ret;
108 		buf += ret;
109 
110 		rec->bytes_written += ret;
111 	}
112 
113 	return 0;
114 }
115 
116 static int process_synthesized_event(struct perf_tool *tool,
117 				     union perf_event *event,
118 				     struct perf_sample *sample __maybe_unused,
119 				     struct machine *machine __maybe_unused)
120 {
121 	struct perf_record *rec = container_of(tool, struct perf_record, tool);
122 	if (write_output(rec, event, event->header.size) < 0)
123 		return -1;
124 
125 	return 0;
126 }
127 
/*
 * Drain all new events from one mmap'ed ring buffer into the output file
 * and advance the tail so the kernel can reuse the space.
 *
 * Returns 0 on success (including "nothing to read"), -1 on write failure.
 */
static int perf_record__mmap_read(struct perf_record *rec,
				   struct perf_mmap *md)
{
	unsigned int head = perf_mmap__read_head(md);	/* kernel's write position */
	unsigned int old = md->prev;			/* our last read position */
	unsigned char *data = md->base + rec->page_size; /* skip the control page */
	unsigned long size;
	void *buf;
	int rc = 0;

	/* Nothing new since last drain. */
	if (old == head)
		return 0;

	rec->samples++;

	size = head - old;

	/*
	 * If the unread region wraps around the end of the ring buffer,
	 * write the tail segment (up to the end of the buffer) first.
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		if (write_output(rec, buf, size) < 0) {
			rc = -1;
			goto out;
		}
	}

	/* Write the (remaining) contiguous segment. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	if (write_output(rec, buf, size) < 0) {
		rc = -1;
		goto out;
	}

	/* Publish the new tail only after the data is safely written. */
	md->prev = old;
	perf_mmap__write_tail(md, old);

out:
	return rc;
}
171 
static volatile int done;
static volatile int signr = -1;
static volatile int child_finished;

/*
 * Async-signal-safe handler: remember which signal fired and flag the main
 * record loop to wind down.  SIGCHLD additionally notes that the traced
 * child has exited.
 */
static void sig_handler(int sig)
{
	child_finished = child_finished || (sig == SIGCHLD);
	done = 1;
	signr = sig;
}
184 
185 static void perf_record__sig_exit(int exit_status __maybe_unused, void *arg)
186 {
187 	struct perf_record *rec = arg;
188 	int status;
189 
190 	if (rec->evlist->workload.pid > 0) {
191 		if (!child_finished)
192 			kill(rec->evlist->workload.pid, SIGTERM);
193 
194 		wait(&status);
195 		if (WIFSIGNALED(status))
196 			psignal(WTERMSIG(status), rec->progname);
197 	}
198 
199 	if (signr == -1 || signr == SIGUSR1)
200 		return;
201 
202 	signal(signr, SIG_DFL);
203 	kill(getpid(), signr);
204 }
205 
206 static bool perf_evlist__equal(struct perf_evlist *evlist,
207 			       struct perf_evlist *other)
208 {
209 	struct perf_evsel *pos, *pair;
210 
211 	if (evlist->nr_entries != other->nr_entries)
212 		return false;
213 
214 	pair = perf_evlist__first(other);
215 
216 	list_for_each_entry(pos, &evlist->entries, node) {
217 		if (memcmp(&pos->attr, &pair->attr, sizeof(pos->attr) != 0))
218 			return false;
219 		pair = perf_evsel__next(pair);
220 	}
221 
222 	return true;
223 }
224 
/*
 * Configure and open all counters in rec->evlist, mmap their ring buffers
 * and attach filters.  Contains a ladder of fallbacks for older kernels:
 * missing exclude_guest/exclude_host, missing sample_id_all, and a cycles
 * -> cpu-clock software-event fallback when no hardware PMU is usable.
 *
 * Returns 0 on success, a negative errno-style value on failure.
 */
static int perf_record__open(struct perf_record *rec)
{
	struct perf_evsel *pos;
	struct perf_evlist *evlist = rec->evlist;
	struct perf_session *session = rec->session;
	struct perf_record_opts *opts = &rec->opts;
	int rc = 0;

	/*
	 * Set the evsel leader links before we configure attributes,
	 * since some might depend on this info.
	 */
	if (opts->group)
		perf_evlist__set_leader(evlist);

	perf_evlist__config_attrs(evlist, opts);

	list_for_each_entry(pos, &evlist->entries, node) {
		struct perf_event_attr *attr = &pos->attr;
		/*
		 * Check if parse_single_tracepoint_event has already asked for
		 * PERF_SAMPLE_TIME.
		 *
		 * XXX this is kludgy but short term fix for problems introduced by
		 * eac23d1c that broke 'perf script' by having different sample_types
		 * when using multiple tracepoint events when we use a perf binary
		 * that tries to use sample_id_all on an older kernel.
		 *
		 * We need to move counter creation to perf_session, support
		 * different sample_types, etc.
		 */
		bool time_needed = attr->sample_type & PERF_SAMPLE_TIME;

fallback_missing_features:
		/* Older kernels reject these bits; clear them and retry. */
		if (opts->exclude_guest_missing)
			attr->exclude_guest = attr->exclude_host = 0;
retry_sample_id:
		attr->sample_id_all = opts->sample_id_all_missing ? 0 : 1;
try_again:
		if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
			int err = errno;

			if (err == EPERM || err == EACCES) {
				ui__error_paranoid();
				rc = -err;
				goto out;
			} else if (err ==  ENODEV && opts->target.cpu_list) {
				pr_err("No such device - did you specify"
				       " an out-of-range profile CPU?\n");
				rc = -err;
				goto out;
			} else if (err == EINVAL) {
				/*
				 * EINVAL from an old kernel: strip one
				 * unsupported feature at a time and retry.
				 */
				if (!opts->exclude_guest_missing &&
				    (attr->exclude_guest || attr->exclude_host)) {
					pr_debug("Old kernel, cannot exclude "
						 "guest or host samples.\n");
					opts->exclude_guest_missing = true;
					goto fallback_missing_features;
				} else if (!opts->sample_id_all_missing) {
					/*
					 * Old kernel, no attr->sample_id_type_all field
					 */
					opts->sample_id_all_missing = true;
					if (!opts->sample_time && !opts->raw_samples && !time_needed)
						attr->sample_type &= ~PERF_SAMPLE_TIME;

					goto retry_sample_id;
				}
			}

			/*
			 * If it's cycles then fall back to hrtimer
			 * based cpu-clock-tick sw counter, which
			 * is always available even if no PMU support.
			 *
			 * PPC returns ENXIO until 2.6.37 (behavior changed
			 * with commit b0a873e).
			 */
			if ((err == ENOENT || err == ENXIO)
					&& attr->type == PERF_TYPE_HARDWARE
					&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

				if (verbose)
					ui__warning("The cycles event is not supported, "
						    "trying to fall back to cpu-clock-ticks\n");
				attr->type = PERF_TYPE_SOFTWARE;
				attr->config = PERF_COUNT_SW_CPU_CLOCK;
				/* Drop the stale name so it is regenerated. */
				if (pos->name) {
					free(pos->name);
					pos->name = NULL;
				}
				goto try_again;
			}

			if (err == ENOENT) {
				ui__error("The %s event is not supported.\n",
					  perf_evsel__name(pos));
				rc = -err;
				goto out;
			} else if ((err == EOPNOTSUPP) && (attr->precise_ip)) {
				ui__error("\'precise\' request may not be supported. "
					  "Try removing 'p' modifier\n");
				rc = -err;
				goto out;
			}

			printf("\n");
			error("sys_perf_event_open() syscall returned with %d "
			      "(%s) for event %s. /bin/dmesg may provide "
			      "additional information.\n",
			      err, strerror(err), perf_evsel__name(pos));

#if defined(__i386__) || defined(__x86_64__)
			if (attr->type == PERF_TYPE_HARDWARE &&
			    err == EOPNOTSUPP) {
				pr_err("No hardware sampling interrupt available."
				       " No APIC? If so then you can boot the kernel"
				       " with the \"lapic\" boot parameter to"
				       " force-enable it.\n");
				rc = -err;
				goto out;
			}
#endif

			pr_err("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
			rc = -err;
			goto out;
		}
	}

	if (perf_evlist__apply_filters(evlist)) {
		error("failed to set filter with %d (%s)\n", errno,
			strerror(errno));
		rc = -1;
		goto out;
	}

	if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
		if (errno == EPERM) {
			pr_err("Permission error mapping pages.\n"
			       "Consider increasing "
			       "/proc/sys/kernel/perf_event_mlock_kb,\n"
			       "or try again with a smaller value of -m/--mmap_pages.\n"
			       "(current value: %d)\n", opts->mmap_pages);
			rc = -errno;
		} else if (!is_power_of_2(opts->mmap_pages) &&
			   (opts->mmap_pages != UINT_MAX)) {
			pr_err("--mmap_pages/-m value must be a power of two.");
			rc = -EINVAL;
		} else {
			pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
			rc = -errno;
		}
		goto out;
	}

	/* Appending: the on-disk event list must match what we just opened. */
	if (rec->file_new)
		session->evlist = evlist;
	else {
		if (!perf_evlist__equal(session->evlist, evlist)) {
			fprintf(stderr, "incompatible append\n");
			rc = -1;
			goto out;
		}
 	}

	perf_session__set_id_hdr_size(session);
out:
	return rc;
}
395 
/*
 * Re-scan the event data we just recorded so that DSOs which actually got
 * samples are marked for buildid table emission in the header.
 */
static int process_buildids(struct perf_record *rec)
{
	/*
	 * Current file offset == total file size, since we write
	 * sequentially.  NOTE(review): an lseek failure (-1) would wrap to
	 * a huge u64 here — assumed not to happen on a regular output file.
	 */
	u64 size = lseek(rec->output, 0, SEEK_CUR);

	if (size == 0)
		return 0;

	rec->session->fd = rec->output;
	return __perf_session__process_events(rec->session, rec->post_processing_offset,
					      size - rec->post_processing_offset,
					      size, &build_id__mark_dso_hit_ops);
}
408 
409 static void perf_record__exit(int status, void *arg)
410 {
411 	struct perf_record *rec = arg;
412 
413 	if (status != 0)
414 		return;
415 
416 	if (!rec->opts.pipe_output) {
417 		rec->session->header.data_size += rec->bytes_written;
418 
419 		if (!rec->no_buildid)
420 			process_buildids(rec);
421 		perf_session__write_header(rec->session, rec->evlist,
422 					   rec->output, true);
423 		perf_session__delete(rec->session);
424 		perf_evlist__delete(rec->evlist);
425 		symbol__exit();
426 	}
427 }
428 
429 static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
430 {
431 	int err;
432 	struct perf_tool *tool = data;
433 
434 	if (machine__is_host(machine))
435 		return;
436 
437 	/*
438 	 *As for guest kernel when processing subcommand record&report,
439 	 *we arrange module mmap prior to guest kernel mmap and trigger
440 	 *a preload dso because default guest module symbols are loaded
441 	 *from guest kallsyms instead of /lib/modules/XXX/XXX. This
442 	 *method is used to avoid symbol missing when the first addr is
443 	 *in module instead of in guest kernel.
444 	 */
445 	err = perf_event__synthesize_modules(tool, process_synthesized_event,
446 					     machine);
447 	if (err < 0)
448 		pr_err("Couldn't record guest kernel [%d]'s reference"
449 		       " relocation symbol.\n", machine->pid);
450 
451 	/*
452 	 * We use _stext for guest kernel because guest kernel's /proc/kallsyms
453 	 * have no _text sometimes.
454 	 */
455 	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
456 						 machine, "_text");
457 	if (err < 0)
458 		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
459 							 machine, "_stext");
460 	if (err < 0)
461 		pr_err("Couldn't record guest kernel [%d]'s reference"
462 		       " relocation symbol.\n", machine->pid);
463 }
464 
/* Header-only marker record written after each full drain of all mmaps. */
static struct perf_event_header finished_round_event = {
	.size = sizeof(struct perf_event_header),
	.type = PERF_RECORD_FINISHED_ROUND,
};
469 
470 static int perf_record__mmap_read_all(struct perf_record *rec)
471 {
472 	int i;
473 	int rc = 0;
474 
475 	for (i = 0; i < rec->evlist->nr_mmaps; i++) {
476 		if (rec->evlist->mmap[i].base) {
477 			if (perf_record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
478 				rc = -1;
479 				goto out;
480 			}
481 		}
482 	}
483 
484 	if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
485 		rc = write_output(rec, &finished_round_event,
486 				  sizeof(finished_round_event));
487 
488 out:
489 	return rc;
490 }
491 
/*
 * The record session proper: set up the output file and perf_session, open
 * the counters, synthesize the initial metadata events (attrs, kernel and
 * module mmaps, threads), then loop draining the mmap ring buffers until
 * told to stop.  The forked workload (if any) and final header rewrite are
 * handled by the on_exit handlers registered here.
 *
 * Returns 0 on success, negative on error.
 */
static int __cmd_record(struct perf_record *rec, int argc, const char **argv)
{
	struct stat st;
	int flags;
	int err, output, feat;
	unsigned long waking = 0;
	const bool forks = argc > 0;	/* non-option args => workload to fork */
	struct machine *machine;
	struct perf_tool *tool = &rec->tool;
	struct perf_record_opts *opts = &rec->opts;
	struct perf_evlist *evsel_list = rec->evlist;
	const char *output_name = rec->output_name;
	struct perf_session *session;
	bool disabled = false;

	rec->progname = argv[0];

	rec->page_size = sysconf(_SC_PAGE_SIZE);

	/* Cleanup/termination handlers; see perf_record__sig_exit/__exit. */
	on_exit(perf_record__sig_exit, rec);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);
	signal(SIGUSR1, sig_handler);

	/*
	 * Output selection: default to perf.data, or pipe mode when stdout
	 * is a FIFO or "-" was given explicitly.
	 */
	if (!output_name) {
		if (!fstat(STDOUT_FILENO, &st) && S_ISFIFO(st.st_mode))
			opts->pipe_output = true;
		else
			rec->output_name = output_name = "perf.data";
	}
	if (output_name) {
		if (!strcmp(output_name, "-"))
			opts->pipe_output = true;
		else if (!stat(output_name, &st) && st.st_size) {
			/* Overwriting: keep the previous file as <name>.old. */
			if (rec->write_mode == WRITE_FORCE) {
				char oldname[PATH_MAX];
				snprintf(oldname, sizeof(oldname), "%s.old",
					 output_name);
				unlink(oldname);
				rename(output_name, oldname);
			}
		} else if (rec->write_mode == WRITE_APPEND) {
			/* Nothing to append to: fall back to a fresh file. */
			rec->write_mode = WRITE_FORCE;
		}
	}

	flags = O_CREAT|O_RDWR;
	if (rec->write_mode == WRITE_APPEND)
		rec->file_new = 0;
	else
		flags |= O_TRUNC;

	if (opts->pipe_output)
		output = STDOUT_FILENO;
	else
		output = open(output_name, flags, S_IRUSR | S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		return -1;
	}

	rec->output = output;

	session = perf_session__new(output_name, O_WRONLY,
				    rec->write_mode == WRITE_FORCE, false, NULL);
	if (session == NULL) {
		pr_err("Not enough memory for reading perf file header\n");
		return -1;
	}

	rec->session = session;

	/* Start with every header feature on, then turn off what doesn't apply. */
	for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
		perf_header__set_feat(&session->header, feat);

	if (rec->no_buildid)
		perf_header__clear_feat(&session->header, HEADER_BUILD_ID);

	if (!have_tracepoints(&evsel_list->entries))
		perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);

	if (!rec->opts.branch_stack)
		perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);

	/* Appending: load the existing header so we can compare event lists. */
	if (!rec->file_new) {
		err = perf_session__read_header(session, output);
		if (err < 0)
			goto out_delete_session;
	}

	/* Fork the workload now (stopped) so counters can inherit into it. */
	if (forks) {
		err = perf_evlist__prepare_workload(evsel_list, opts, argv);
		if (err < 0) {
			pr_err("Couldn't run the workload!\n");
			goto out_delete_session;
		}
	}

	if (perf_record__open(rec) != 0) {
		err = -1;
		goto out_delete_session;
	}

	/*
	 * perf_session__delete(session) will be called at perf_record__exit()
	 */
	on_exit(perf_record__exit, rec);

	if (opts->pipe_output) {
		err = perf_header__write_pipe(output);
		if (err < 0)
			goto out_delete_session;
	} else if (rec->file_new) {
		err = perf_session__write_header(session, evsel_list,
						 output, false);
		if (err < 0)
			goto out_delete_session;
	}

	if (!rec->no_buildid
	    && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
		pr_err("Couldn't generate buildids. "
		       "Use --no-buildid to profile anyway.\n");
		err = -1;
		goto out_delete_session;
	}

	/* Everything past this offset is event data (buildid scan start). */
	rec->post_processing_offset = lseek(output, 0, SEEK_CUR);

	machine = perf_session__find_host_machine(session);
	if (!machine) {
		pr_err("Couldn't find native kernel information.\n");
		err = -1;
		goto out_delete_session;
	}

	/*
	 * Pipe mode has no header to carry attrs/event types/tracing data,
	 * so synthesize them inline as events.
	 */
	if (opts->pipe_output) {
		err = perf_event__synthesize_attrs(tool, session,
						   process_synthesized_event);
		if (err < 0) {
			pr_err("Couldn't synthesize attrs.\n");
			goto out_delete_session;
		}

		err = perf_event__synthesize_event_types(tool, process_synthesized_event,
							 machine);
		if (err < 0) {
			pr_err("Couldn't synthesize event_types.\n");
			goto out_delete_session;
		}

		if (have_tracepoints(&evsel_list->entries)) {
			/*
			 * FIXME err <= 0 here actually means that
			 * there were no tracepoints so its not really
			 * an error, just that we don't need to
			 * synthesize anything.  We really have to
			 * return this more properly and also
			 * propagate errors that now are calling die()
			 */
			err = perf_event__synthesize_tracing_data(tool, output, evsel_list,
								  process_synthesized_event);
			if (err <= 0) {
				pr_err("Couldn't record tracing data.\n");
				goto out_delete_session;
			}
			advance_output(rec, err);
		}
	}

	/* Kernel mmap: prefer _text, fall back to _stext (see kallsyms note). */
	err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
						 machine, "_text");
	if (err < 0)
		err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
							 machine, "_stext");
	if (err < 0)
		pr_err("Couldn't record kernel reference relocation symbol\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/kallsyms permission or run as root.\n");

	err = perf_event__synthesize_modules(tool, process_synthesized_event,
					     machine);
	if (err < 0)
		pr_err("Couldn't record kernel module information.\n"
		       "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
		       "Check /proc/modules permission or run as root.\n");

	if (perf_guest)
		perf_session__process_machines(session, tool,
					       perf_event__synthesize_guest_os);

	/* Synthesize existing thread maps for targeted vs system-wide runs. */
	if (!opts->target.system_wide)
		err = perf_event__synthesize_thread_map(tool, evsel_list->threads,
						  process_synthesized_event,
						  machine);
	else
		err = perf_event__synthesize_threads(tool, process_synthesized_event,
					       machine);

	if (err != 0)
		goto out_delete_session;

	if (rec->realtime_prio) {
		struct sched_param param;

		param.sched_priority = rec->realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			pr_err("Could not set realtime priority.\n");
			err = -1;
			goto out_delete_session;
		}
	}

	/*
	 * When perf is starting the traced process, all the events
	 * (apart from group members) have enable_on_exec=1 set,
	 * so don't spoil it by prematurely enabling them.
	 */
	if (!perf_target__none(&opts->target))
		perf_evlist__enable(evsel_list);

	/*
	 * Let the child rip
	 */
	if (forks)
		perf_evlist__start_workload(evsel_list);

	/* Main loop: drain mmaps; sleep in poll() when nothing arrived. */
	for (;;) {
		int hits = rec->samples;

		if (perf_record__mmap_read_all(rec) < 0) {
			err = -1;
			goto out_delete_session;
		}

		if (hits == rec->samples) {
			if (done)
				break;
			err = poll(evsel_list->pollfd, evsel_list->nr_fds, -1);
			waking++;
		}

		/*
		 * When perf is starting the traced process, at the end events
		 * die with the process and we wait for that. Thus no need to
		 * disable events in this case.
		 */
		if (done && !disabled && !perf_target__none(&opts->target)) {
			perf_evlist__disable(evsel_list);
			disabled = true;
		}
	}

	/* SIGUSR1 means "stop quietly": skip the summary output. */
	if (quiet || signr == SIGUSR1)
		return 0;

	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
		(double)rec->bytes_written / 1024.0 / 1024.0,
		output_name,
		rec->bytes_written / 24);

	return 0;

out_delete_session:
	perf_session__delete(session);
	return err;
}
765 
/* Helpers for building the branch-filter name -> mode-bit lookup table. */
#define BRANCH_OPT(n, m) \
	{ .name = n, .mode = (m) }

#define BRANCH_END { .name = NULL }

struct branch_mode {
	const char *name;	/* user-visible filter keyword */
	int mode;		/* corresponding PERF_SAMPLE_BRANCH_* bit */
};

/* Keywords accepted by -b/--branch-filter; NULL-name entry terminates. */
static const struct branch_mode branch_modes[] = {
	BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
	BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
	BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
	BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
	BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
	BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
	BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
	BRANCH_END
};
786 
/*
 * Option callback for -b/-j: parse a comma-separated list of branch filter
 * keywords into the PERF_SAMPLE_BRANCH_* bitmask at opt->value.  With no
 * argument (bare -b) or only privilege-level bits, defaults to "any".
 *
 * Returns 0 on success, -1 on an unknown keyword or if already set.
 */
static int
parse_branch_stack(const struct option *opt, const char *str, int unset)
{
#define ONLY_PLM \
	(PERF_SAMPLE_BRANCH_USER	|\
	 PERF_SAMPLE_BRANCH_KERNEL	|\
	 PERF_SAMPLE_BRANCH_HV)

	uint64_t *mode = (uint64_t *)opt->value;
	const struct branch_mode *br;
	char *s, *os = NULL, *p;
	int ret = -1;

	if (unset)
		return 0;

	/*
	 * cannot set it twice, -b + --branch-filter for instance
	 */
	if (*mode)
		return -1;

	/* str may be NULL in case no arg is passed to -b */
	if (str) {
		/* because str is read-only */
		s = os = strdup(str);
		if (!s)
			return -1;

		/* Split on ',' in place and look each token up in the table. */
		for (;;) {
			p = strchr(s, ',');
			if (p)
				*p = '\0';

			for (br = branch_modes; br->name; br++) {
				if (!strcasecmp(s, br->name))
					break;
			}
			/* Hit the NULL terminator: token matched nothing. */
			if (!br->name) {
				ui__warning("unknown branch filter %s,"
					    " check man page\n", s);
				goto error;
			}

			*mode |= br->mode;

			if (!p)
				break;

			s = p + 1;
		}
	}
	ret = 0;

	/* default to any branch */
	if ((*mode & ~ONLY_PLM) == 0) {
		*mode = PERF_SAMPLE_BRANCH_ANY;
	}
error:
	free(os);
	return ret;
}
849 
#ifdef LIBUNWIND_SUPPORT
/*
 * Parse the dwarf stack-dump size argument of -g.  The value is rounded up
 * to a u64 multiple and must be nonzero and fit in the sample's u16 size
 * field.  On success stores the size in *_size and returns 0; on any
 * failure prints an error and returns -1.
 */
static int get_stack_size(char *str, unsigned long *_size)
{
	unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
	char *endptr;
	unsigned long size = strtoul(str, &endptr, 0);

	/* Reject trailing garbage after the number. */
	if (*endptr == '\0') {
		size = round_up(size, sizeof(u64));
		if (size && size <= max_size) {
			*_size = size;
			return 0;
		}
	}

	pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
	       max_size, str);
	return -1;
}
#endif /* LIBUNWIND_SUPPORT */
877 
878 static int
879 parse_callchain_opt(const struct option *opt __maybe_unused, const char *arg,
880 		    int unset)
881 {
882 	struct perf_record *rec = (struct perf_record *)opt->value;
883 	char *tok, *name, *saveptr = NULL;
884 	char *buf;
885 	int ret = -1;
886 
887 	/* --no-call-graph */
888 	if (unset)
889 		return 0;
890 
891 	/* We specified default option if none is provided. */
892 	BUG_ON(!arg);
893 
894 	/* We need buffer that we know we can write to. */
895 	buf = malloc(strlen(arg) + 1);
896 	if (!buf)
897 		return -ENOMEM;
898 
899 	strcpy(buf, arg);
900 
901 	tok = strtok_r((char *)buf, ",", &saveptr);
902 	name = tok ? : (char *)buf;
903 
904 	do {
905 		/* Framepointer style */
906 		if (!strncmp(name, "fp", sizeof("fp"))) {
907 			if (!strtok_r(NULL, ",", &saveptr)) {
908 				rec->opts.call_graph = CALLCHAIN_FP;
909 				ret = 0;
910 			} else
911 				pr_err("callchain: No more arguments "
912 				       "needed for -g fp\n");
913 			break;
914 
915 #ifdef LIBUNWIND_SUPPORT
916 		/* Dwarf style */
917 		} else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
918 			const unsigned long default_stack_dump_size = 8192;
919 
920 			ret = 0;
921 			rec->opts.call_graph = CALLCHAIN_DWARF;
922 			rec->opts.stack_dump_size = default_stack_dump_size;
923 
924 			tok = strtok_r(NULL, ",", &saveptr);
925 			if (tok) {
926 				unsigned long size = 0;
927 
928 				ret = get_stack_size(tok, &size);
929 				rec->opts.stack_dump_size = size;
930 			}
931 
932 			if (!ret)
933 				pr_debug("callchain: stack dump size %d\n",
934 					 rec->opts.stack_dump_size);
935 #endif /* LIBUNWIND_SUPPORT */
936 		} else {
937 			pr_err("callchain: Unknown -g option "
938 			       "value: %s\n", arg);
939 			break;
940 		}
941 
942 	} while (0);
943 
944 	free(buf);
945 
946 	if (!ret)
947 		pr_debug("callchain: type %d\n", rec->opts.call_graph);
948 
949 	return ret;
950 }
951 
/* Synopsis lines shown by usage_with_options(). */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
957 
958 /*
959  * XXX Ideally would be local to cmd_record() and passed to a perf_record__new
960  * because we need to have access to it in perf_record__exit, that is called
961  * after cmd_record() exits, but since record_options need to be accessible to
962  * builtin-script, leave it here.
963  *
964  * At least we don't ouch it in all the other functions here directly.
965  *
966  * Just say no to tons of global variables, sigh.
967  */
static struct perf_record record = {
	.opts = {
		.mmap_pages	     = UINT_MAX,	/* "unset" sentinels: */
		.user_freq	     = UINT_MAX,	/* replaced by -F */
		.user_interval	     = ULLONG_MAX,	/* replaced by -c */
		.freq		     = 4000,		/* default sample frequency (Hz) */
		.target		     = {
			.uses_mmap   = true,
		},
	},
	.write_mode = WRITE_FORCE,
	.file_new   = true,
};
981 
/* Help text for -g; the dwarf mode is only offered with libunwind built in. */
#define CALLCHAIN_HELP "do call-graph (stack chain/backtrace) recording: "

#ifdef LIBUNWIND_SUPPORT
static const char callchain_help[] = CALLCHAIN_HELP "[fp] dwarf";
#else
static const char callchain_help[] = CALLCHAIN_HELP "[fp]";
#endif
989 
990 /*
991  * XXX Will stay a global variable till we fix builtin-script.c to stop messing
992  * with it and switch to use the library functions in perf_evlist that came
993  * from builtin-record.c, i.e. use perf_record_opts,
994  * perf_evlist__prepare_workload, etc instead of fork+exec'in 'perf record',
995  * using pipes, etc.
996  */
/* Option table for 'perf record'; non-static because builtin-script uses it. */
const struct option record_options[] = {
	OPT_CALLBACK('e', "event", &record.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_CALLBACK(0, "filter", &record.evlist, "filter",
		     "event filter", parse_filter),
	OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
		    "record events on existing process id"),
	OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
		    "record events on existing thread id"),
	OPT_INTEGER('r', "realtime", &record.realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('D', "no-delay", &record.opts.no_delay,
		    "collect data without buffering"),
	OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &record.append_file,
			    "append to the output file to do incremental profiling"),
	OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN('f', "force", &record.force,
			"overwrite existing data file (deprecated)"),
	OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
	OPT_STRING('o', "output", &record.output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "no-inherit", &record.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
	OPT_UINTEGER('m', "mmap-pages", &record.opts.mmap_pages,
		     "number of mmap data pages"),
	OPT_BOOLEAN(0, "group", &record.opts.group,
		    "put the counters into a counter group"),
	/* -g defaults to "fp" when given without an argument. */
	OPT_CALLBACK_DEFAULT('g', "call-graph", &record, "mode[,dump_size]",
			     callchain_help, &parse_callchain_opt,
			     "fp"),
	OPT_INCR('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
	OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &record.opts.sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
	OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
	OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
		    "don't sample"),
	OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
		    "do not update the buildid cache"),
	OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
		    "do not collect buildids in perf.data"),
	OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
		     "monitor event in cgroup name only",
		     parse_cgroups),
	OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
		   "user to profile"),

	/* -b and -j share the parser; -b takes no argument (any branch). */
	OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
		     "branch any", "sample any taken branches",
		     parse_branch_stack),

	OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
		     "branch filter mask", "branch stack filter modes",
		     parse_branch_stack),
	OPT_END()
};
1064 
/*
 * Entry point for 'perf record': parse and validate options, set up the
 * event list and target maps, then hand off to __cmd_record().
 *
 * Returns 0 on success, negative on error.
 */
int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
	int err = -ENOMEM;
	struct perf_evsel *pos;
	struct perf_evlist *evsel_list;
	struct perf_record *rec = &record;
	char errbuf[BUFSIZ];

	evsel_list = perf_evlist__new(NULL, NULL);
	if (evsel_list == NULL)
		return -ENOMEM;

	rec->evlist = evsel_list;

	argc = parse_options(argc, argv, record_options, record_usage,
			    PARSE_OPT_STOP_AT_NON_OPTION);
	/* Need either a workload to fork or an existing target to attach to. */
	if (!argc && perf_target__none(&rec->opts.target))
		usage_with_options(record_usage, record_options);

	/* -f and -A are mutually exclusive; -A switches to append mode. */
	if (rec->force && rec->append_file) {
		ui__error("Can't overwrite and append at the same time."
			  " You need to choose between -f and -A");
		usage_with_options(record_usage, record_options);
	} else if (rec->append_file) {
		rec->write_mode = WRITE_APPEND;
	} else {
		rec->write_mode = WRITE_FORCE;
	}

	if (nr_cgroups && !rec->opts.target.system_wide) {
		ui__error("cgroup monitoring only available in"
			  " system-wide mode\n");
		usage_with_options(record_usage, record_options);
	}

	symbol__init();

	if (symbol_conf.kptr_restrict)
		pr_warning(
"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
"check /proc/sys/kernel/kptr_restrict.\n\n"
"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
"file is not found in the buildid cache or in the vmlinux path.\n\n"
"Samples in kernel modules won't be resolved at all.\n\n"
"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
"even with a suitable vmlinux or kallsyms file.\n\n");

	if (rec->no_buildid_cache || rec->no_buildid)
		disable_buildid_cache();

	/* No -e given: fall back to the default event (cycles). */
	if (evsel_list->nr_entries == 0 &&
	    perf_evlist__add_default(evsel_list) < 0) {
		pr_err("Not enough memory for event selector list\n");
		goto out_symbol_exit;
	}

	/* Target validation only warns; uid parse failure is fatal. */
	err = perf_target__validate(&rec->opts.target);
	if (err) {
		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__warning("%s", errbuf);
	}

	err = perf_target__parse_uid(&rec->opts.target);
	if (err) {
		int saved_errno = errno;

		perf_target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
		ui__error("%s", errbuf);

		err = -saved_errno;
		goto out_free_fd;
	}

	err = -ENOMEM;
	if (perf_evlist__create_maps(evsel_list, &rec->opts.target) < 0)
		usage_with_options(record_usage, record_options);

	list_for_each_entry(pos, &evsel_list->entries, node) {
		if (perf_header__push_event(pos->attr.config, perf_evsel__name(pos)))
			goto out_free_fd;
	}

	/* Translate the UINT/ULLONG_MAX "unset" sentinels into real values. */
	if (rec->opts.user_interval != ULLONG_MAX)
		rec->opts.default_interval = rec->opts.user_interval;
	if (rec->opts.user_freq != UINT_MAX)
		rec->opts.freq = rec->opts.user_freq;

	/*
	 * User specified count overrides default frequency.
	 */
	if (rec->opts.default_interval)
		rec->opts.freq = 0;
	else if (rec->opts.freq) {
		rec->opts.default_interval = rec->opts.freq;
	} else {
		ui__error("frequency and count are zero, aborting\n");
		err = -EINVAL;
		goto out_free_fd;
	}

	err = __cmd_record(&record, argc, argv);
out_free_fd:
	perf_evlist__delete_maps(evsel_list);
out_symbol_exit:
	symbol__exit();
	return err;
}
1172