xref: /linux/tools/perf/builtin-record.c (revision a33f32244d8550da8b4a26e277ce07d5c6d158b5)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #define _FILE_OFFSET_BITS 64
9 
10 #include "builtin.h"
11 
12 #include "perf.h"
13 
14 #include "util/build-id.h"
15 #include "util/util.h"
16 #include "util/parse-options.h"
17 #include "util/parse-events.h"
18 #include "util/string.h"
19 
20 #include "util/header.h"
21 #include "util/event.h"
22 #include "util/debug.h"
23 #include "util/session.h"
24 #include "util/symbol.h"
25 #include "util/cpumap.h"
26 
27 #include <unistd.h>
28 #include <sched.h>
29 
30 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
31 
32 static long			default_interval		=      0;
33 
34 static int			nr_cpus				=      0;
35 static unsigned int		page_size;
36 static unsigned int		mmap_pages			=    128;
37 static int			freq				=   1000;
38 static int			output;
39 static const char		*output_name			= "perf.data";
40 static int			group				=      0;
41 static unsigned int		realtime_prio			=      0;
42 static int			raw_samples			=      0;
43 static int			system_wide			=      0;
44 static int			profile_cpu			=     -1;
45 static pid_t			target_pid			=     -1;
46 static pid_t			child_pid			=     -1;
47 static int			inherit				=      1;
48 static int			force				=      0;
49 static int			append_file			=      0;
50 static int			call_graph			=      0;
51 static int			inherit_stat			=      0;
52 static int			no_samples			=      0;
53 static int			sample_address			=      0;
54 static int			multiplex			=      0;
55 static int			multiplex_fd			=     -1;
56 
57 static long			samples				=      0;
58 static struct timeval		last_read;
59 static struct timeval		this_read;
60 
61 static u64			bytes_written			=      0;
62 
63 static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
64 
65 static int			nr_poll				=      0;
66 static int			nr_cpu				=      0;
67 
68 static int			file_new			=      1;
69 static off_t			post_processing_offset;
70 
71 static struct perf_session	*session;
72 
73 struct mmap_data {
74 	int			counter;
75 	void			*base;
76 	unsigned int		mask;
77 	unsigned int		prev;
78 };
79 
80 static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
81 
82 static unsigned long mmap_read_head(struct mmap_data *md)
83 {
84 	struct perf_event_mmap_page *pc = md->base;
85 	long head;
86 
87 	head = pc->data_head;
88 	rmb();
89 
90 	return head;
91 }
92 
93 static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
94 {
95 	struct perf_event_mmap_page *pc = md->base;
96 
97 	/*
98 	 * ensure all reads are done before we write the tail out.
99 	 */
100 	/* mb(); */
101 	pc->data_tail = tail;
102 }
103 
104 static void write_output(void *buf, size_t size)
105 {
106 	while (size) {
107 		int ret = write(output, buf, size);
108 
109 		if (ret < 0)
110 			die("failed to write");
111 
112 		size -= ret;
113 		buf += ret;
114 
115 		bytes_written += ret;
116 	}
117 }
118 
119 static int process_synthesized_event(event_t *event,
120 				     struct perf_session *self __used)
121 {
122 	write_output(event, event->header.size);
123 	return 0;
124 }
125 
126 static void mmap_read(struct mmap_data *md)
127 {
128 	unsigned int head = mmap_read_head(md);
129 	unsigned int old = md->prev;
130 	unsigned char *data = md->base + page_size;
131 	unsigned long size;
132 	void *buf;
133 	int diff;
134 
135 	gettimeofday(&this_read, NULL);
136 
137 	/*
138 	 * If we're further behind than half the buffer, there's a chance
139 	 * the writer will bite our tail and mess up the samples under us.
140 	 *
141 	 * If we somehow ended up ahead of the head, we got messed up.
142 	 *
143 	 * In either case, truncate and restart at head.
144 	 */
145 	diff = head - old;
146 	if (diff < 0) {
147 		struct timeval iv;
148 		unsigned long msecs;
149 
150 		timersub(&this_read, &last_read, &iv);
151 		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
152 
153 		fprintf(stderr, "WARNING: failed to keep up with mmap data."
154 				"  Last read %lu msecs ago.\n", msecs);
155 
156 		/*
157 		 * head points to a known good entry, start there.
158 		 */
159 		old = head;
160 	}
161 
162 	last_read = this_read;
163 
164 	if (old != head)
165 		samples++;
166 
167 	size = head - old;
168 
169 	if ((old & md->mask) + size != (head & md->mask)) {
170 		buf = &data[old & md->mask];
171 		size = md->mask + 1 - (old & md->mask);
172 		old += size;
173 
174 		write_output(buf, size);
175 	}
176 
177 	buf = &data[old & md->mask];
178 	size = head - old;
179 	old += size;
180 
181 	write_output(buf, size);
182 
183 	md->prev = old;
184 	mmap_write_tail(md, old);
185 }
186 
187 static volatile int done = 0;
188 static volatile int signr = -1;
189 
190 static void sig_handler(int sig)
191 {
192 	done = 1;
193 	signr = sig;
194 }
195 
196 static void sig_atexit(void)
197 {
198 	if (child_pid != -1)
199 		kill(child_pid, SIGTERM);
200 
201 	if (signr == -1)
202 		return;
203 
204 	signal(signr, SIG_DFL);
205 	kill(getpid(), signr);
206 }
207 
208 static int group_fd;
209 
210 static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
211 {
212 	struct perf_header_attr *h_attr;
213 
214 	if (nr < session->header.attrs) {
215 		h_attr = session->header.attr[nr];
216 	} else {
217 		h_attr = perf_header_attr__new(a);
218 		if (h_attr != NULL)
219 			if (perf_header__add_attr(&session->header, h_attr) < 0) {
220 				perf_header_attr__delete(h_attr);
221 				h_attr = NULL;
222 			}
223 	}
224 
225 	return h_attr;
226 }
227 
228 static void create_counter(int counter, int cpu, pid_t pid)
229 {
230 	char *filter = filters[counter];
231 	struct perf_event_attr *attr = attrs + counter;
232 	struct perf_header_attr *h_attr;
233 	int track = !counter; /* only the first counter needs these */
234 	int ret;
235 	struct {
236 		u64 count;
237 		u64 time_enabled;
238 		u64 time_running;
239 		u64 id;
240 	} read_data;
241 
242 	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
243 				  PERF_FORMAT_TOTAL_TIME_RUNNING |
244 				  PERF_FORMAT_ID;
245 
246 	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
247 
248 	if (nr_counters > 1)
249 		attr->sample_type |= PERF_SAMPLE_ID;
250 
251 	if (freq) {
252 		attr->sample_type	|= PERF_SAMPLE_PERIOD;
253 		attr->freq		= 1;
254 		attr->sample_freq	= freq;
255 	}
256 
257 	if (no_samples)
258 		attr->sample_freq = 0;
259 
260 	if (inherit_stat)
261 		attr->inherit_stat = 1;
262 
263 	if (sample_address)
264 		attr->sample_type	|= PERF_SAMPLE_ADDR;
265 
266 	if (call_graph)
267 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
268 
269 	if (raw_samples) {
270 		attr->sample_type	|= PERF_SAMPLE_TIME;
271 		attr->sample_type	|= PERF_SAMPLE_RAW;
272 		attr->sample_type	|= PERF_SAMPLE_CPU;
273 	}
274 
275 	attr->mmap		= track;
276 	attr->comm		= track;
277 	attr->inherit		= inherit;
278 	attr->disabled		= 1;
279 
280 try_again:
281 	fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
282 
283 	if (fd[nr_cpu][counter] < 0) {
284 		int err = errno;
285 
286 		if (err == EPERM || err == EACCES)
287 			die("Permission error - are you root?\n");
288 		else if (err ==  ENODEV && profile_cpu != -1)
289 			die("No such device - did you specify an out-of-range profile CPU?\n");
290 
291 		/*
292 		 * If it's cycles then fall back to hrtimer
293 		 * based cpu-clock-tick sw counter, which
294 		 * is always available even if no PMU support:
295 		 */
296 		if (attr->type == PERF_TYPE_HARDWARE
297 			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
298 
299 			if (verbose)
300 				warning(" ... trying to fall back to cpu-clock-ticks\n");
301 			attr->type = PERF_TYPE_SOFTWARE;
302 			attr->config = PERF_COUNT_SW_CPU_CLOCK;
303 			goto try_again;
304 		}
305 		printf("\n");
306 		error("perfcounter syscall returned with %d (%s)\n",
307 			fd[nr_cpu][counter], strerror(err));
308 
309 #if defined(__i386__) || defined(__x86_64__)
310 		if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
311 			die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
312 #endif
313 
314 		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
315 		exit(-1);
316 	}
317 
318 	h_attr = get_header_attr(attr, counter);
319 	if (h_attr == NULL)
320 		die("nomem\n");
321 
322 	if (!file_new) {
323 		if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
324 			fprintf(stderr, "incompatible append\n");
325 			exit(-1);
326 		}
327 	}
328 
329 	if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
330 		perror("Unable to read perf file descriptor\n");
331 		exit(-1);
332 	}
333 
334 	if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
335 		pr_warning("Not enough memory to add id\n");
336 		exit(-1);
337 	}
338 
339 	assert(fd[nr_cpu][counter] >= 0);
340 	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
341 
342 	/*
343 	 * First counter acts as the group leader:
344 	 */
345 	if (group && group_fd == -1)
346 		group_fd = fd[nr_cpu][counter];
347 	if (multiplex && multiplex_fd == -1)
348 		multiplex_fd = fd[nr_cpu][counter];
349 
350 	if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
351 
352 		ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
353 		assert(ret != -1);
354 	} else {
355 		event_array[nr_poll].fd = fd[nr_cpu][counter];
356 		event_array[nr_poll].events = POLLIN;
357 		nr_poll++;
358 
359 		mmap_array[nr_cpu][counter].counter = counter;
360 		mmap_array[nr_cpu][counter].prev = 0;
361 		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
362 		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
363 				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
364 		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
365 			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
366 			exit(-1);
367 		}
368 	}
369 
370 	if (filter != NULL) {
371 		ret = ioctl(fd[nr_cpu][counter],
372 			    PERF_EVENT_IOC_SET_FILTER, filter);
373 		if (ret) {
374 			error("failed to set filter with %d (%s)\n", errno,
375 			      strerror(errno));
376 			exit(-1);
377 		}
378 	}
379 
380 	ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
381 }
382 
383 static void open_counters(int cpu, pid_t pid)
384 {
385 	int counter;
386 
387 	group_fd = -1;
388 	for (counter = 0; counter < nr_counters; counter++)
389 		create_counter(counter, cpu, pid);
390 
391 	nr_cpu++;
392 }
393 
394 static int process_buildids(void)
395 {
396 	u64 size = lseek(output, 0, SEEK_CUR);
397 
398 	if (size == 0)
399 		return 0;
400 
401 	session->fd = output;
402 	return __perf_session__process_events(session, post_processing_offset,
403 					      size - post_processing_offset,
404 					      size, &build_id__mark_dso_hit_ops);
405 }
406 
407 static void atexit_header(void)
408 {
409 	session->header.data_size += bytes_written;
410 
411 	process_buildids();
412 	perf_header__write(&session->header, output, true);
413 }
414 
415 static int __cmd_record(int argc, const char **argv)
416 {
417 	int i, counter;
418 	struct stat st;
419 	pid_t pid = 0;
420 	int flags;
421 	int err;
422 	unsigned long waking = 0;
423 	int child_ready_pipe[2], go_pipe[2];
424 	const bool forks = target_pid == -1 && argc > 0;
425 	char buf;
426 
427 	page_size = sysconf(_SC_PAGE_SIZE);
428 
429 	atexit(sig_atexit);
430 	signal(SIGCHLD, sig_handler);
431 	signal(SIGINT, sig_handler);
432 
433 	if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
434 		perror("failed to create pipes");
435 		exit(-1);
436 	}
437 
438 	if (!stat(output_name, &st) && st.st_size) {
439 		if (!force) {
440 			if (!append_file) {
441 				pr_err("Error, output file %s exists, use -A "
442 				       "to append or -f to overwrite.\n",
443 				       output_name);
444 				exit(-1);
445 			}
446 		} else {
447 			char oldname[PATH_MAX];
448 			snprintf(oldname, sizeof(oldname), "%s.old",
449 				 output_name);
450 			unlink(oldname);
451 			rename(output_name, oldname);
452 		}
453 	} else {
454 		append_file = 0;
455 	}
456 
457 	flags = O_CREAT|O_RDWR;
458 	if (append_file)
459 		file_new = 0;
460 	else
461 		flags |= O_TRUNC;
462 
463 	output = open(output_name, flags, S_IRUSR|S_IWUSR);
464 	if (output < 0) {
465 		perror("failed to create output file");
466 		exit(-1);
467 	}
468 
469 	session = perf_session__new(output_name, O_WRONLY, force);
470 	if (session == NULL) {
471 		pr_err("Not enough memory for reading perf file header\n");
472 		return -1;
473 	}
474 
475 	if (!file_new) {
476 		err = perf_header__read(&session->header, output);
477 		if (err < 0)
478 			return err;
479 	}
480 
481 	if (raw_samples) {
482 		perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
483 	} else {
484 		for (i = 0; i < nr_counters; i++) {
485 			if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
486 				perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
487 				break;
488 			}
489 		}
490 	}
491 
492 	atexit(atexit_header);
493 
494 	if (forks) {
495 		pid = fork();
496 		if (pid < 0) {
497 			perror("failed to fork");
498 			exit(-1);
499 		}
500 
501 		if (!pid) {
502 			close(child_ready_pipe[0]);
503 			close(go_pipe[1]);
504 			fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
505 
506 			/*
507 			 * Do a dummy execvp to get the PLT entry resolved,
508 			 * so we avoid the resolver overhead on the real
509 			 * execvp call.
510 			 */
511 			execvp("", (char **)argv);
512 
513 			/*
514 			 * Tell the parent we're ready to go
515 			 */
516 			close(child_ready_pipe[1]);
517 
518 			/*
519 			 * Wait until the parent tells us to go.
520 			 */
521 			if (read(go_pipe[0], &buf, 1) == -1)
522 				perror("unable to read pipe");
523 
524 			execvp(argv[0], (char **)argv);
525 
526 			perror(argv[0]);
527 			exit(-1);
528 		}
529 
530 		child_pid = pid;
531 
532 		if (!system_wide)
533 			target_pid = pid;
534 
535 		close(child_ready_pipe[1]);
536 		close(go_pipe[0]);
537 		/*
538 		 * wait for child to settle
539 		 */
540 		if (read(child_ready_pipe[0], &buf, 1) == -1) {
541 			perror("unable to read pipe");
542 			exit(-1);
543 		}
544 		close(child_ready_pipe[0]);
545 	}
546 
547 
548 	if ((!system_wide && !inherit) || profile_cpu != -1) {
549 		open_counters(profile_cpu, target_pid);
550 	} else {
551 		nr_cpus = read_cpu_map();
552 		for (i = 0; i < nr_cpus; i++)
553 			open_counters(cpumap[i], target_pid);
554 	}
555 
556 	if (file_new) {
557 		err = perf_header__write(&session->header, output, false);
558 		if (err < 0)
559 			return err;
560 	}
561 
562 	post_processing_offset = lseek(output, 0, SEEK_CUR);
563 
564 	err = event__synthesize_kernel_mmap(process_synthesized_event,
565 					    session, "_text");
566 	if (err < 0) {
567 		pr_err("Couldn't record kernel reference relocation symbol.\n");
568 		return err;
569 	}
570 
571 	err = event__synthesize_modules(process_synthesized_event, session);
572 	if (err < 0) {
573 		pr_err("Couldn't record kernel reference relocation symbol.\n");
574 		return err;
575 	}
576 
577 	if (!system_wide && profile_cpu == -1)
578 		event__synthesize_thread(target_pid, process_synthesized_event,
579 					 session);
580 	else
581 		event__synthesize_threads(process_synthesized_event, session);
582 
583 	if (realtime_prio) {
584 		struct sched_param param;
585 
586 		param.sched_priority = realtime_prio;
587 		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
588 			pr_err("Could not set realtime priority.\n");
589 			exit(-1);
590 		}
591 	}
592 
593 	/*
594 	 * Let the child rip
595 	 */
596 	if (forks)
597 		close(go_pipe[1]);
598 
599 	for (;;) {
600 		int hits = samples;
601 
602 		for (i = 0; i < nr_cpu; i++) {
603 			for (counter = 0; counter < nr_counters; counter++) {
604 				if (mmap_array[i][counter].base)
605 					mmap_read(&mmap_array[i][counter]);
606 			}
607 		}
608 
609 		if (hits == samples) {
610 			if (done)
611 				break;
612 			err = poll(event_array, nr_poll, -1);
613 			waking++;
614 		}
615 
616 		if (done) {
617 			for (i = 0; i < nr_cpu; i++) {
618 				for (counter = 0; counter < nr_counters; counter++)
619 					ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE);
620 			}
621 		}
622 	}
623 
624 	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
625 
626 	/*
627 	 * Approximate RIP event size: 24 bytes.
628 	 */
629 	fprintf(stderr,
630 		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
631 		(double)bytes_written / 1024.0 / 1024.0,
632 		output_name,
633 		bytes_written / 24);
634 
635 	return 0;
636 }
637 
638 static const char * const record_usage[] = {
639 	"perf record [<options>] [<command>]",
640 	"perf record [<options>] -- <command> [<options>]",
641 	NULL
642 };
643 
644 static const struct option options[] = {
645 	OPT_CALLBACK('e', "event", NULL, "event",
646 		     "event selector. use 'perf list' to list available events",
647 		     parse_events),
648 	OPT_CALLBACK(0, "filter", NULL, "filter",
649 		     "event filter", parse_filter),
650 	OPT_INTEGER('p', "pid", &target_pid,
651 		    "record events on existing pid"),
652 	OPT_INTEGER('r', "realtime", &realtime_prio,
653 		    "collect data with this RT SCHED_FIFO priority"),
654 	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
655 		    "collect raw sample records from all opened counters"),
656 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
657 			    "system-wide collection from all CPUs"),
658 	OPT_BOOLEAN('A', "append", &append_file,
659 			    "append to the output file to do incremental profiling"),
660 	OPT_INTEGER('C', "profile_cpu", &profile_cpu,
661 			    "CPU to profile on"),
662 	OPT_BOOLEAN('f', "force", &force,
663 			"overwrite existing data file"),
664 	OPT_LONG('c', "count", &default_interval,
665 		    "event period to sample"),
666 	OPT_STRING('o', "output", &output_name, "file",
667 		    "output file name"),
668 	OPT_BOOLEAN('i', "inherit", &inherit,
669 		    "child tasks inherit counters"),
670 	OPT_INTEGER('F', "freq", &freq,
671 		    "profile at this frequency"),
672 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
673 		    "number of mmap data pages"),
674 	OPT_BOOLEAN('g', "call-graph", &call_graph,
675 		    "do call-graph (stack chain/backtrace) recording"),
676 	OPT_BOOLEAN('v', "verbose", &verbose,
677 		    "be more verbose (show counter open errors, etc)"),
678 	OPT_BOOLEAN('s', "stat", &inherit_stat,
679 		    "per thread counts"),
680 	OPT_BOOLEAN('d', "data", &sample_address,
681 		    "Sample addresses"),
682 	OPT_BOOLEAN('n', "no-samples", &no_samples,
683 		    "don't sample"),
684 	OPT_BOOLEAN('M', "multiplex", &multiplex,
685 		    "multiplex counter output in a single channel"),
686 	OPT_END()
687 };
688 
689 int cmd_record(int argc, const char **argv, const char *prefix __used)
690 {
691 	int counter;
692 
693 	argc = parse_options(argc, argv, options, record_usage,
694 			    PARSE_OPT_STOP_AT_NON_OPTION);
695 	if (!argc && target_pid == -1 && !system_wide && profile_cpu == -1)
696 		usage_with_options(record_usage, options);
697 
698 	symbol__init();
699 
700 	if (!nr_counters) {
701 		nr_counters	= 1;
702 		attrs[0].type	= PERF_TYPE_HARDWARE;
703 		attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
704 	}
705 
706 	/*
707 	 * User specified count overrides default frequency.
708 	 */
709 	if (default_interval)
710 		freq = 0;
711 	else if (freq) {
712 		default_interval = freq;
713 	} else {
714 		fprintf(stderr, "frequency and count are zero, aborting\n");
715 		exit(EXIT_FAILURE);
716 	}
717 
718 	for (counter = 0; counter < nr_counters; counter++) {
719 		if (attrs[counter].sample_period)
720 			continue;
721 
722 		attrs[counter].sample_period = default_interval;
723 	}
724 
725 	return __cmd_record(argc, argv);
726 }
727