xref: /linux/tools/perf/builtin-record.c (revision a0b54e256d513ed99e456bea6e4e188ff92e7c46)
1 /*
2  * builtin-record.c
3  *
4  * Builtin record command: Record the profile of a workload
5  * (or a CPU, or a PID) into the perf.data output file - for
6  * later analysis via perf report.
7  */
8 #include "builtin.h"
9 
10 #include "perf.h"
11 
12 #include "util/util.h"
13 #include "util/parse-options.h"
14 #include "util/parse-events.h"
15 #include "util/string.h"
16 
17 #include "util/header.h"
18 #include "util/event.h"
19 #include "util/debug.h"
20 #include "util/trace-event.h"
21 
22 #include <unistd.h>
23 #include <sched.h>
24 
/* Round x up to the next multiple of a (a must be a power of two). */
#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))

/* One counter fd per (cpu index, counter index) pair. */
static int			fd[MAX_NR_CPUS][MAX_COUNTERS];

/* Default sample_period for counters that did not specify one. */
static long			default_interval		= 100000;

static int			nr_cpus				= 0;
static unsigned int		page_size;
/* Ring-buffer size in pages per counter (-m); must be a power of two. */
static unsigned int		mmap_pages			= 128;
static int			freq				= 0;	/* -F: sample frequency (0 = period mode) */
static int			output;					/* fd of the perf.data output file */
static const char		*output_name			= "perf.data";
static int			group				= 0;	/* group counters under one leader */
static unsigned int		realtime_prio			= 0;	/* -r: SCHED_FIFO priority */
static int			raw_samples			= 0;	/* -R: record raw sample data */
static int			system_wide			= 0;	/* -a: profile all CPUs */
static int			profile_cpu			= -1;	/* -C: restrict to one CPU */
static pid_t			target_pid			= -1;	/* -p: attach to existing pid */
static int			inherit				= 1;	/* child tasks inherit counters */
static int			force				= 0;	/* -f: overwrite existing output */
static int			append_file			= 0;	/* -A: append to existing output */
static int			call_graph			= 0;	/* -g: record callchains */
static int			inherit_stat			= 0;	/* -s: per-thread counts */
static int			no_samples			= 0;	/* -n: counting only, no samples */
static int			sample_address			= 0;	/* -d: record data addresses */

/* Number of wakeups that actually found data; reported at exit. */
static long			samples;
static struct timeval		last_read;
static struct timeval		this_read;

/* Sample bytes streamed so far; folded into the header at exit. */
static u64			bytes_written;

static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];

static int			nr_poll;	/* entries used in event_array */
static int			nr_cpu;		/* rows used in fd[]/mmap_array[] */

/* 1 when creating a fresh perf.data, 0 when appending (-A). */
static int			file_new = 1;

struct perf_header		*header;

/*
 * Per-counter view of one kernel mmap ring buffer:
 * base points at the control page, data follows one page later.
 */
struct mmap_data {
	int			counter;
	void			*base;
	unsigned int		mask;	/* buffer size in bytes minus 1 */
	unsigned int		prev;	/* our last read offset (tail) */
};

static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
75 
76 static unsigned long mmap_read_head(struct mmap_data *md)
77 {
78 	struct perf_counter_mmap_page *pc = md->base;
79 	long head;
80 
81 	head = pc->data_head;
82 	rmb();
83 
84 	return head;
85 }
86 
/*
 * Publish our read position (tail) back to the kernel via the control
 * page so the consumed part of the ring buffer can be reused.
 */
static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
{
	struct perf_counter_mmap_page *pc = md->base;

	/*
	 * ensure all reads are done before we write the tail out.
	 */
	/* mb(); */
	/*
	 * NOTE(review): the barrier above is commented out, so on
	 * weakly-ordered CPUs this tail store may be reordered before
	 * the data reads it is meant to follow - confirm whether a
	 * full barrier primitive is available here.
	 */
	pc->data_tail = tail;
}
97 
98 static void write_output(void *buf, size_t size)
99 {
100 	while (size) {
101 		int ret = write(output, buf, size);
102 
103 		if (ret < 0)
104 			die("failed to write");
105 
106 		size -= ret;
107 		buf += ret;
108 
109 		bytes_written += ret;
110 	}
111 }
112 
/*
 * Drain all new data between our last read offset (md->prev) and the
 * kernel's write offset (head) from one ring buffer, append it to the
 * output file, and publish the new tail back to the kernel.
 */
static void mmap_read(struct mmap_data *md)
{
	unsigned int head = mmap_read_head(md);
	unsigned int old = md->prev;
	/* sample data starts one page after the control page */
	unsigned char *data = md->base + page_size;
	unsigned long size;
	void *buf;
	int diff;

	gettimeofday(&this_read, NULL);

	/*
	 * If we're further behind than half the buffer, there's a chance
	 * the writer will bite our tail and mess up the samples under us.
	 *
	 * If we somehow ended up ahead of the head, we got messed up.
	 *
	 * In either case, truncate and restart at head.
	 */
	diff = head - old;
	if (diff < 0) {
		struct timeval iv;
		unsigned long msecs;

		timersub(&this_read, &last_read, &iv);
		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;

		fprintf(stderr, "WARNING: failed to keep up with mmap data."
				"  Last read %lu msecs ago.\n", msecs);

		/*
		 * head points to a known good entry, start there.
		 */
		old = head;
	}

	last_read = this_read;

	/* count a wakeup that actually found data, for the exit summary */
	if (old != head)
		samples++;

	size = head - old;

	/*
	 * If the valid region wraps past the end of the buffer, flush
	 * the chunk up to the end first ...
	 */
	if ((old & md->mask) + size != (head & md->mask)) {
		buf = &data[old & md->mask];
		size = md->mask + 1 - (old & md->mask);
		old += size;

		write_output(buf, size);
	}

	/* ... then the remainder from the start of the buffer. */
	buf = &data[old & md->mask];
	size = head - old;
	old += size;

	write_output(buf, size);

	md->prev = old;
	mmap_write_tail(md, old);
}
173 
static volatile int done;
static volatile int signr = -1;

/*
 * SIGINT/SIGCHLD handler: only record which signal arrived and ask
 * the main loop to stop, so the buffers get drained before exit.
 */
static void sig_handler(int sig)
{
	signr = sig;
	done = 1;
}

/*
 * atexit hook: if we are exiting because of a signal, re-raise it
 * with the default disposition so our exit status reflects it.
 */
static void sig_atexit(void)
{
	if (signr != -1) {
		signal(signr, SIG_DFL);
		kill(getpid(), signr);
	}
}
191 
192 static pid_t pid_synthesize_comm_event(pid_t pid, int full)
193 {
194 	struct comm_event comm_ev;
195 	char filename[PATH_MAX];
196 	char bf[BUFSIZ];
197 	FILE *fp;
198 	size_t size = 0;
199 	DIR *tasks;
200 	struct dirent dirent, *next;
201 	pid_t tgid = 0;
202 
203 	snprintf(filename, sizeof(filename), "/proc/%d/status", pid);
204 
205 	fp = fopen(filename, "r");
206 	if (fp == NULL) {
207 		/*
208 		 * We raced with a task exiting - just return:
209 		 */
210 		if (verbose)
211 			fprintf(stderr, "couldn't open %s\n", filename);
212 		return 0;
213 	}
214 
215 	memset(&comm_ev, 0, sizeof(comm_ev));
216 	while (!comm_ev.comm[0] || !comm_ev.pid) {
217 		if (fgets(bf, sizeof(bf), fp) == NULL)
218 			goto out_failure;
219 
220 		if (memcmp(bf, "Name:", 5) == 0) {
221 			char *name = bf + 5;
222 			while (*name && isspace(*name))
223 				++name;
224 			size = strlen(name) - 1;
225 			memcpy(comm_ev.comm, name, size++);
226 		} else if (memcmp(bf, "Tgid:", 5) == 0) {
227 			char *tgids = bf + 5;
228 			while (*tgids && isspace(*tgids))
229 				++tgids;
230 			tgid = comm_ev.pid = atoi(tgids);
231 		}
232 	}
233 
234 	comm_ev.header.type = PERF_EVENT_COMM;
235 	size = ALIGN(size, sizeof(u64));
236 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
237 
238 	if (!full) {
239 		comm_ev.tid = pid;
240 
241 		write_output(&comm_ev, comm_ev.header.size);
242 		goto out_fclose;
243 	}
244 
245 	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
246 
247 	tasks = opendir(filename);
248 	while (!readdir_r(tasks, &dirent, &next) && next) {
249 		char *end;
250 		pid = strtol(dirent.d_name, &end, 10);
251 		if (*end)
252 			continue;
253 
254 		comm_ev.tid = pid;
255 
256 		write_output(&comm_ev, comm_ev.header.size);
257 	}
258 	closedir(tasks);
259 
260 out_fclose:
261 	fclose(fp);
262 	return tgid;
263 
264 out_failure:
265 	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
266 		filename);
267 	exit(EXIT_FAILURE);
268 }
269 
/*
 * Parse /proc/<pid>/maps and synthesize a PERF_EVENT_MMAP record for
 * every executable mapping, so perf report can resolve sample IPs to
 * the binaries that were mapped when recording started.
 */
static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid)
{
	char filename[PATH_MAX];
	FILE *fp;

	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);

	fp = fopen(filename, "r");
	if (fp == NULL) {
		/*
		 * We raced with a task exiting - just return:
		 */
		if (verbose)
			fprintf(stderr, "couldn't open %s\n", filename);
		return;
	}
	while (1) {
		char bf[BUFSIZ], *pbf = bf;
		struct mmap_event mmap_ev = {
			.header = { .type = PERF_EVENT_MMAP },
		};
		int n;
		size_t size;
		if (fgets(bf, sizeof(bf), fp) == NULL)
			break;

		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
		n = hex2u64(pbf, &mmap_ev.start);
		if (n < 0)
			continue;
		pbf += n + 1;	/* skip the '-' separator */
		n = hex2u64(pbf, &mmap_ev.len);
		if (n < 0)
			continue;
		pbf += n + 3;	/* skip " r-" to land on the 'x' flag */
		if (*pbf == 'x') { /* vm_exec */
			char *execname = strchr(bf, '/');

			/* Catch VDSO */
			if (execname == NULL)
				execname = strstr(bf, "[vdso]");

			if (execname == NULL)
				continue;

			size = strlen(execname);
			execname[size - 1] = '\0'; /* Remove \n */
			/*
			 * NOTE(review): 'size' (including the NUL) is copied
			 * unbounded into mmap_ev.filename - confirm the
			 * field is at least PATH_MAX so a long maps line
			 * cannot overflow it.
			 */
			memcpy(mmap_ev.filename, execname, size);
			size = ALIGN(size, sizeof(u64));
			/* maps gives [start,end); the event wants a length */
			mmap_ev.len -= mmap_ev.start;
			mmap_ev.header.size = (sizeof(mmap_ev) -
					       (sizeof(mmap_ev.filename) - size));
			mmap_ev.pid = tgid;
			mmap_ev.tid = pid;

			write_output(&mmap_ev, mmap_ev.header.size);
		}
	}

	fclose(fp);
}
331 
/*
 * Walk /proc and synthesize COMM + MMAP records for every process
 * already running, so system-wide reports can resolve their samples.
 *
 * Fix: opendir("/proc") was previously passed to readdir_r()
 * unchecked, which would dereference NULL if it failed.
 */
static void synthesize_all(void)
{
	DIR *proc;
	struct dirent dirent, *next;

	proc = opendir("/proc");
	if (proc == NULL) {
		/* Without /proc there is nothing we can synthesize. */
		fprintf(stderr, "couldn't open /proc\n");
		return;
	}

	while (!readdir_r(proc, &dirent, &next) && next) {
		char *end;
		pid_t pid, tgid;

		pid = strtol(dirent.d_name, &end, 10);
		if (*end) /* only interested in proper numerical dirents */
			continue;

		tgid = pid_synthesize_comm_event(pid, 1);
		pid_synthesize_mmap_samples(pid, tgid);
	}

	closedir(proc);
}
353 
354 static int group_fd;
355 
356 static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr)
357 {
358 	struct perf_header_attr *h_attr;
359 
360 	if (nr < header->attrs) {
361 		h_attr = header->attr[nr];
362 	} else {
363 		h_attr = perf_header_attr__new(a);
364 		perf_header__add_attr(header, h_attr);
365 	}
366 
367 	return h_attr;
368 }
369 
/*
 * Configure, open and mmap one counter for (cpu, pid).  Falls back
 * from hardware cycles to the software cpu-clock counter when no PMU
 * support is available.  Registers the resulting fd for polling and
 * records its ring buffer in mmap_array[nr_cpu][counter].
 */
static void create_counter(int counter, int cpu, pid_t pid)
{
	struct perf_counter_attr *attr = attrs + counter;
	struct perf_header_attr *h_attr;
	int track = !counter; /* only the first counter needs these */
	/* layout must match the read_format flags requested below */
	struct {
		u64 count;
		u64 time_enabled;
		u64 time_running;
		u64 id;
	} read_data;

	attr->read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
				  PERF_FORMAT_TOTAL_TIME_RUNNING |
				  PERF_FORMAT_ID;

	attr->sample_type	|= PERF_SAMPLE_IP | PERF_SAMPLE_TID;

	if (freq) {
		/* frequency mode: kernel adapts the period, so record it */
		attr->sample_type	|= PERF_SAMPLE_PERIOD;
		attr->freq		= 1;
		attr->sample_freq	= freq;
	}

	if (no_samples)
		attr->sample_freq = 0;

	if (inherit_stat)
		attr->inherit_stat = 1;

	if (sample_address)
		attr->sample_type	|= PERF_SAMPLE_ADDR;

	if (call_graph)
		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;

	if (raw_samples) {
		attr->sample_type	|= PERF_SAMPLE_TIME;
		attr->sample_type	|= PERF_SAMPLE_RAW;
		attr->sample_type	|= PERF_SAMPLE_CPU;
	}

	attr->mmap		= track;
	attr->comm		= track;
	/* per-cpu counters cannot be inherited */
	attr->inherit		= (cpu < 0) && inherit;
	/* start disabled; enabled via ioctl once the buffer is mapped */
	attr->disabled		= 1;

try_again:
	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);

	if (fd[nr_cpu][counter] < 0) {
		int err = errno;

		if (err == EPERM)
			die("Permission error - are you root?\n");
		else if (err ==  ENODEV && profile_cpu != -1)
			die("No such device - did you specify an out-of-range profile CPU?\n");

		/*
		 * If it's cycles then fall back to hrtimer
		 * based cpu-clock-tick sw counter, which
		 * is always available even if no PMU support:
		 */
		if (attr->type == PERF_TYPE_HARDWARE
			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {

			if (verbose)
				warning(" ... trying to fall back to cpu-clock-ticks\n");
			attr->type = PERF_TYPE_SOFTWARE;
			attr->config = PERF_COUNT_SW_CPU_CLOCK;
			goto try_again;
		}
		printf("\n");
		error("perfcounter syscall returned with %d (%s)\n",
			fd[nr_cpu][counter], strerror(err));
		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
		exit(-1);
	}

	h_attr = get_header_attr(attr, counter);

	/* when appending, the attr must match what is already on disk */
	if (!file_new) {
		if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
			fprintf(stderr, "incompatible append\n");
			exit(-1);
		}
	}

	/* one read to learn the kernel-assigned counter id */
	if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
		perror("Unable to read perf file descriptor\n");
		exit(-1);
	}

	perf_header_attr__add_id(h_attr, read_data.id);

	assert(fd[nr_cpu][counter] >= 0);
	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);

	/*
	 * First counter acts as the group leader:
	 */
	if (group && group_fd == -1)
		group_fd = fd[nr_cpu][counter];

	event_array[nr_poll].fd = fd[nr_cpu][counter];
	event_array[nr_poll].events = POLLIN;
	nr_poll++;

	mmap_array[nr_cpu][counter].counter = counter;
	mmap_array[nr_cpu][counter].prev = 0;
	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
	/* +1 page for the control/header page in front of the data */
	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
			PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
		exit(-1);
	}

	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
}
490 
491 static void open_counters(int cpu, pid_t pid)
492 {
493 	int counter;
494 
495 	group_fd = -1;
496 	for (counter = 0; counter < nr_counters; counter++)
497 		create_counter(counter, cpu, pid);
498 
499 	nr_cpu++;
500 }
501 
/*
 * atexit hook: account the sample bytes streamed after the header,
 * then rewrite the header so perf.data is self-describing.
 */
static void atexit_header(void)
{
	header->data_size += bytes_written;

	perf_header__write(header, output);
}
508 
/*
 * Main record logic: set up the output file and header, open the
 * counters, synthesize COMM/MMAP records for pre-existing tasks,
 * optionally fork the workload, then drain the ring buffers until
 * the workload finishes or we are interrupted.
 */
static int __cmd_record(int argc, const char **argv)
{
	int i, counter;
	struct stat st;
	pid_t pid = 0;
	int flags;
	int ret;

	page_size = sysconf(_SC_PAGE_SIZE);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	assert(nr_cpus <= MAX_NR_CPUS);
	assert(nr_cpus >= 0);

	atexit(sig_atexit);
	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	/* refuse to clobber an existing non-empty file without -f/-A */
	if (!stat(output_name, &st) && st.st_size) {
		if (!force && !append_file) {
			fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
					output_name);
			exit(-1);
		}
	} else {
		append_file = 0;	/* nothing to append to */
	}

	flags = O_CREAT|O_RDWR;
	if (append_file)
		file_new = 0;
	else
		flags |= O_TRUNC;

	output = open(output_name, flags, S_IRUSR|S_IWUSR);
	if (output < 0) {
		perror("failed to create output file");
		exit(-1);
	}

	/* appending: reuse the on-disk header; otherwise start fresh */
	if (!file_new)
		header = perf_header__read(output);
	else
		header = perf_header__new();


	/* embed tracepoint metadata if any counter records raw samples */
	if (raw_samples) {
		read_tracing_data(attrs, nr_counters);
	} else {
		for (i = 0; i < nr_counters; i++) {
			if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
				read_tracing_data(attrs, nr_counters);
				break;
			}
		}
	}
	atexit(atexit_header);

	if (!system_wide) {
		pid = target_pid;
		if (pid == -1)
			pid = getpid();	/* profile ourselves + forked child */

		open_counters(profile_cpu, pid);
	} else {
		if (profile_cpu != -1) {
			open_counters(profile_cpu, target_pid);
		} else {
			/* one set of counters per online CPU */
			for (i = 0; i < nr_cpus; i++)
				open_counters(i, target_pid);
		}
	}

	if (file_new)
		perf_header__write(header, output);

	/* describe tasks that existed before recording started */
	if (!system_wide) {
		pid_t tgid = pid_synthesize_comm_event(pid, 0);
		pid_synthesize_mmap_samples(pid, tgid);
	} else
		synthesize_all();

	/* not attaching to an existing pid: fork+exec the workload */
	if (target_pid == -1 && argc) {
		pid = fork();
		if (pid < 0)
			perror("failed to fork");

		if (!pid) {
			if (execvp(argv[0], (char **)argv)) {
				perror(argv[0]);
				exit(-1);
			}
		}
	}

	if (realtime_prio) {
		struct sched_param param;

		param.sched_priority = realtime_prio;
		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
			printf("Could not set realtime priority.\n");
			exit(-1);
		}
	}

	for (;;) {
		int hits = samples;

		for (i = 0; i < nr_cpu; i++) {
			for (counter = 0; counter < nr_counters; counter++)
				mmap_read(&mmap_array[i][counter]);
		}

		/* no new data this pass: exit if signalled, else wait */
		if (hits == samples) {
			if (done)
				break;
			/* NOTE(review): poll() result is ignored by design -
			 * it only serves as a bounded sleep here. */
			ret = poll(event_array, nr_poll, 100);
		}
	}

	/*
	 * Approximate RIP event size: 24 bytes.
	 */
	fprintf(stderr,
		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
		(double)bytes_written / 1024.0 / 1024.0,
		output_name,
		bytes_written / 24);

	return 0;
}
639 
/* Usage strings shown by usage_with_options()/parse_options(). */
static const char * const record_usage[] = {
	"perf record [<options>] [<command>]",
	"perf record [<options>] -- <command> [<options>]",
	NULL
};
645 
/* Command-line option table; each entry binds to a file-scope flag above. */
static const struct option options[] = {
	OPT_CALLBACK('e', "event", NULL, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events),
	OPT_INTEGER('p', "pid", &target_pid,
		    "record events on existing pid"),
	/*
	 * NOTE(review): realtime_prio is declared unsigned int but is
	 * parsed via OPT_INTEGER - confirm the parser stores through an
	 * int-compatible pointer here.
	 */
	OPT_INTEGER('r', "realtime", &realtime_prio,
		    "collect data with this RT SCHED_FIFO priority"),
	OPT_BOOLEAN('R', "raw-samples", &raw_samples,
		    "collect raw sample records from all opened counters"),
	OPT_BOOLEAN('a', "all-cpus", &system_wide,
			    "system-wide collection from all CPUs"),
	OPT_BOOLEAN('A', "append", &append_file,
			    "append to the output file to do incremental profiling"),
	OPT_INTEGER('C', "profile_cpu", &profile_cpu,
			    "CPU to profile on"),
	OPT_BOOLEAN('f', "force", &force,
			"overwrite existing data file"),
	OPT_LONG('c', "count", &default_interval,
		    "event period to sample"),
	OPT_STRING('o', "output", &output_name, "file",
		    "output file name"),
	OPT_BOOLEAN('i', "inherit", &inherit,
		    "child tasks inherit counters"),
	OPT_INTEGER('F', "freq", &freq,
		    "profile at this frequency"),
	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
		    "number of mmap data pages"),
	OPT_BOOLEAN('g', "call-graph", &call_graph,
		    "do call-graph (stack chain/backtrace) recording"),
	OPT_BOOLEAN('v', "verbose", &verbose,
		    "be more verbose (show counter open errors, etc)"),
	OPT_BOOLEAN('s', "stat", &inherit_stat,
		    "per thread counts"),
	OPT_BOOLEAN('d', "data", &sample_address,
		    "Sample addresses"),
	OPT_BOOLEAN('n', "no-samples", &no_samples,
		    "don't sample"),
	OPT_END()
};
686 
687 int cmd_record(int argc, const char **argv, const char *prefix __used)
688 {
689 	int counter;
690 
691 	argc = parse_options(argc, argv, options, record_usage,
692 		PARSE_OPT_STOP_AT_NON_OPTION);
693 	if (!argc && target_pid == -1 && !system_wide)
694 		usage_with_options(record_usage, options);
695 
696 	if (!nr_counters) {
697 		nr_counters	= 1;
698 		attrs[0].type	= PERF_TYPE_HARDWARE;
699 		attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
700 	}
701 
702 	for (counter = 0; counter < nr_counters; counter++) {
703 		if (attrs[counter].sample_period)
704 			continue;
705 
706 		attrs[counter].sample_period = default_interval;
707 	}
708 
709 	return __cmd_record(argc, argv);
710 }
711