xref: /linux/tools/perf/builtin-trace.c (revision 564f7dfde24a405d877168f150ae5d29d3ad99c7)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/path.h"
28 #include "util/session.h"
29 #include "util/thread.h"
30 #include <subcmd/parse-options.h>
31 #include "util/strlist.h"
32 #include "util/intlist.h"
33 #include "util/thread_map.h"
34 #include "util/stat.h"
35 #include "trace/beauty/beauty.h"
36 #include "trace-event.h"
37 #include "util/parse-events.h"
38 #include "util/bpf-loader.h"
39 #include "callchain.h"
40 #include "print_binary.h"
41 #include "string2.h"
42 #include "syscalltbl.h"
43 #include "rb_resort.h"
44 
45 #include <errno.h>
46 #include <inttypes.h>
47 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
48 #include <poll.h>
49 #include <signal.h>
50 #include <stdlib.h>
51 #include <string.h>
52 #include <linux/err.h>
53 #include <linux/filter.h>
54 #include <linux/audit.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 
60 #include "sane_ctype.h"
61 
62 #ifndef O_CLOEXEC
63 # define O_CLOEXEC		02000000
64 #endif
65 
/*
 * All state for one 'perf trace' session: command line configuration plus
 * everything built up at runtime (event list, per-syscall table, stats).
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls (or syscalls) enter/exit */
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* NOTE(review): presumably the last thread seen — confirm at use sites */
	u64			base_time;
	FILE			*output;	/* where the trace lines go */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names selected on the command line */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs whose events are filtered out */
	}			filter_pids;
	double			duration_filter;	/* assumed: minimum duration to show, ms — verify at filter site */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* tool-internal hit counters, see show_tool_stats */
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier lists syscalls to *exclude* */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* NOTE(review): looks like the syscall id for "open" — confirm where set */
};
117 
/*
 * Accessor for one field in a tracepoint's raw payload: the byte offset
 * into sample->raw_data plus a fetch function that either returns the
 * value (integer) or its address (pointer) — only one is ever used for
 * a given field, hence the anonymous union.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
125 
/*
 * Generate tp_field__u{8,16,32,64}(): fetch an unsigned integer of the
 * given width from the raw tracepoint data at the field's offset.
 * memcpy is used because raw_data + offset need not be suitably aligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, selected when the data was recorded on a
 * machine of the opposite endianness (see tp_field__init_uint()).
 * No 8-bit variant: a single byte has nothing to swap.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
150 
151 static int tp_field__init_uint(struct tp_field *field,
152 			       struct format_field *format_field,
153 			       bool needs_swap)
154 {
155 	field->offset = format_field->offset;
156 
157 	switch (format_field->size) {
158 	case 1:
159 		field->integer = tp_field__u8;
160 		break;
161 	case 2:
162 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
163 		break;
164 	case 4:
165 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
166 		break;
167 	case 8:
168 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
169 		break;
170 	default:
171 		return -1;
172 	}
173 
174 	return 0;
175 }
176 
/* Return the address of the field inside the sample's raw payload. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
181 
/* Set up a by-address accessor for the field; always succeeds (returns 0). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
188 
/*
 * Field accessors for the sys_enter/sys_exit tracepoints: the syscall id
 * plus either the argument array (enter) or the return value (exit).
 * A given evsel only ever uses one of the two, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
195 
196 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
197 					  struct tp_field *field,
198 					  const char *name)
199 {
200 	struct format_field *format_field = perf_evsel__field(evsel, name);
201 
202 	if (format_field == NULL)
203 		return -1;
204 
205 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
206 }
207 
/* Init the integer accessor for tracepoint field 'name' in evsel->priv's syscall_tp. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
211 
212 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
213 					 struct tp_field *field,
214 					 const char *name)
215 {
216 	struct format_field *format_field = perf_evsel__field(evsel, name);
217 
218 	if (format_field == NULL)
219 		return -1;
220 
221 	return tp_field__init_ptr(field, format_field);
222 }
223 
/* Init the by-address accessor for tracepoint field 'name' in evsel->priv's syscall_tp. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
227 
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
233 
234 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
235 {
236 	evsel->priv = malloc(sizeof(struct syscall_tp));
237 	if (evsel->priv != NULL) {
238 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
239 			goto out_delete;
240 
241 		evsel->handler = handler;
242 		return 0;
243 	}
244 
245 	return -ENOMEM;
246 
247 out_delete:
248 	zfree(&evsel->priv);
249 	return -ENOENT;
250 }
251 
252 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
253 {
254 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
255 
256 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
257 	if (IS_ERR(evsel))
258 		evsel = perf_evsel__newtp("syscalls", direction);
259 
260 	if (IS_ERR(evsel))
261 		return NULL;
262 
263 	if (perf_evsel__init_syscall_tp(evsel, handler))
264 		goto out_delete;
265 
266 	return evsel;
267 
268 out_delete:
269 	perf_evsel__delete_priv(evsel);
270 	return NULL;
271 }
272 
/* Fetch tracepoint field 'name' from @sample as an integer, via evsel->priv. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same, but returning the field's address (e.g. the syscall args array). */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
280 
/*
 * Map integer syscall argument values to human readable names.
 * 'offset' is subtracted from the value before indexing 'entries',
 * for tables whose first valid value isn't 0 (e.g. epoll_ctl ops).
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Define 'strarray__<array>' wrapping a string table starting at value 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* As DEFINE_STRARRAY, but the table's first entry maps value 'off'. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
297 
298 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
299 						const char *intfmt,
300 					        struct syscall_arg *arg)
301 {
302 	struct strarray *sa = arg->parm;
303 	int idx = arg->val - sa->offset;
304 
305 	if (idx < 0 || idx >= sa->nr_entries)
306 		return scnprintf(bf, size, intfmt, arg->val);
307 
308 	return scnprintf(bf, size, "%s", sa->entries[idx]);
309 }
310 
/* strarray printer with decimal fallback for out-of-range values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
318 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray printer with hex fallback, used for ioctl cmds. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
332 
333 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
334 					struct syscall_arg *arg);
335 
336 #define SCA_FD syscall_arg__scnprintf_fd
337 
338 #ifndef AT_FDCWD
339 #define AT_FDCWD	-100
340 #endif
341 
342 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
343 					   struct syscall_arg *arg)
344 {
345 	int fd = arg->val;
346 
347 	if (fd == AT_FDCWD)
348 		return scnprintf(bf, size, "CWD");
349 
350 	return syscall_arg__scnprintf_fd(bf, size, arg);
351 }
352 
353 #define SCA_FDAT syscall_arg__scnprintf_fd_at
354 
355 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
356 					      struct syscall_arg *arg);
357 
358 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
359 
/* Print an argument (addresses, opaque values) in hex. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
367 
368 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
369 					 struct syscall_arg *arg)
370 {
371 	return scnprintf(bf, size, "%d", arg->val);
372 }
373 
374 #define SCA_INT syscall_arg__scnprintf_int
375 
/* bpf(2) 'cmd' argument values, indexed by BPF_* value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) 'op': EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' argument. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) 'option' argument. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) 'whence'; SEEK_DATA/SEEK_HOLE only where the libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) 'cmd' argument. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit/setrlimit/prlimit64 'resource' argument. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' argument. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) and friends, 'clk_id' argument. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) address families, indexed by AF_* value. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
441 
/*
 * Beautify the access(2) 'mode' argument: "F" for F_OK (0), otherwise
 * the concatenated R/W/X flags, with any leftover unknown bits appended
 * as "|0x...".
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
468 
469 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
470 					      struct syscall_arg *arg);
471 
472 #define SCA_FILENAME syscall_arg__scnprintf_filename
473 
/*
 * Beautify the pipe2(2) 'flags' argument: known O_* bits by name,
 * '|'-separated, with any leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
496 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * tty ioctl request names, indexed by request value minus TCGETS (the
 * strarray offset below); designated indices skip holes in the range.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
523 
/* Fallbacks for libcs that predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Beautify the getrandom(2) 'flags' argument: known GRND_* bits by name,
 * '|'-separated, with any leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
553 
/*
 * Convenience for syscall_fmts[] entries: install the strarray printer
 * and its table for argument index 'arg' in one go.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
557 
558 #include "trace/beauty/eventfd.c"
559 #include "trace/beauty/flock.c"
560 #include "trace/beauty/futex_op.c"
561 #include "trace/beauty/mmap.c"
562 #include "trace/beauty/mode_t.c"
563 #include "trace/beauty/msg_flags.c"
564 #include "trace/beauty/open_flags.c"
565 #include "trace/beauty/perf_event_open.c"
566 #include "trace/beauty/pid.c"
567 #include "trace/beauty/sched_policy.c"
568 #include "trace/beauty/seccomp.c"
569 #include "trace/beauty/signum.c"
570 #include "trace/beauty/socket_type.c"
571 #include "trace/beauty/waitid_options.c"
572 
/*
 * Per-syscall formatting hints, keyed by syscall name.
 *
 * NOTE: this array is searched with bsearch()/strcmp() (see
 * syscall_fmt__find()), so entries MUST be kept sorted by 'name'.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;	/* alternate name (e.g. "pread" -> "pread64") */
	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
	void	   *arg_parm[6];	/* opaque parm (e.g. a strarray) for arg_scnprintf[i] */
	bool	   errmsg;	/* NOTE(review): presumably "print negative return as errno string" — confirm in the sys_exit printer */
	bool	   errpid;	/* return value is a pid */
	bool	   timeout;	/* syscall can legitimately return on timeout */
	bool	   hexret;	/* print the return value in hex (e.g. mmap/brk addresses) */
} syscall_fmts[] = {
	{ .name	    = "access",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
	{ .name	    = "brk",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
	{ .name	    = "chdir",	    .errmsg = true, },
	{ .name	    = "chmod",	    .errmsg = true, },
	{ .name	    = "chroot",	    .errmsg = true, },
	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
	{ .name	    = "clone",	    .errpid = true, },
	{ .name	    = "close",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
	{ .name	    = "connect",    .errmsg = true, },
	{ .name	    = "creat",	    .errmsg = true, },
	{ .name	    = "dup",	    .errmsg = true, },
	{ .name	    = "dup2",	    .errmsg = true, },
	{ .name	    = "dup3",	    .errmsg = true, },
	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
	{ .name	    = "eventfd2",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
	{ .name	    = "faccessat",  .errmsg = true, },
	{ .name	    = "fadvise64",  .errmsg = true, },
	{ .name	    = "fallocate",  .errmsg = true, },
	{ .name	    = "fchdir",	    .errmsg = true, },
	{ .name	    = "fchmod",	    .errmsg = true, },
	{ .name	    = "fchmodat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fchown",	    .errmsg = true, },
	{ .name	    = "fchownat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "fcntl",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
	{ .name	    = "fdatasync",  .errmsg = true, },
	{ .name	    = "flock",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
	{ .name	    = "fsetxattr",  .errmsg = true, },
	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
	{ .name	    = "fstatfs",    .errmsg = true, },
	{ .name	    = "fsync",    .errmsg = true, },
	{ .name	    = "ftruncate", .errmsg = true, },
	{ .name	    = "futex",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
	{ .name	    = "futimesat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "getdents",   .errmsg = true, },
	{ .name	    = "getdents64", .errmsg = true, },
	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",  .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "getxattr",   .errmsg = true, },
	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
	{ .name	    = "ioctl",	    .errmsg = true,
	  .arg_scnprintf = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
			     [1] = SCA_STRHEXARRAY, /* cmd */
			     [2] = SCA_HEX, /* arg */ },
	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
#else
			     [2] = SCA_HEX, /* arg */ }, },
#endif
	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
	{ .name	    = "kill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "lchown",    .errmsg = true, },
	{ .name	    = "lgetxattr",  .errmsg = true, },
	{ .name	    = "linkat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "listxattr",  .errmsg = true, },
	{ .name	    = "llistxattr", .errmsg = true, },
	{ .name	    = "lremovexattr",  .errmsg = true, },
	{ .name	    = "lseek",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
	{ .name	    = "lsetxattr",  .errmsg = true, },
	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
	{ .name	    = "lsxattr",    .errmsg = true, },
	{ .name     = "madvise",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
			     [2] = SCA_MADV_BHV, /* behavior */ }, },
	{ .name	    = "mkdir",    .errmsg = true, },
	{ .name	    = "mkdirat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mknod",      .errmsg = true, },
	{ .name	    = "mknodat",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
	{ .name	    = "mlock",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mlockall",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "mmap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
			     [2] = SCA_MMAP_PROT, /* prot */
			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
	{ .name	    = "mprotect",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
			     [2] = SCA_MMAP_PROT, /* prot */ }, },
	{ .name	    = "mq_unlink", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
			     [3] = SCA_MREMAP_FLAGS, /* flags */
			     [4] = SCA_HEX, /* new_addr */ }, },
	{ .name	    = "munlock",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "munmap",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
	{ .name	    = "name_to_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "newfstatat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "open",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "open_by_handle_at", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "openat",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
	{ .name	    = "perf_event_open", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
			     [3] = SCA_FD,  /* group_fd */
			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
	{ .name	    = "pipe2",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
	{ .name	    = "pwritev",    .errmsg = true, },
	{ .name	    = "read",	    .errmsg = true, },
	{ .name	    = "readlink",   .errmsg = true, },
	{ .name	    = "readlinkat", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "readv",	    .errmsg = true, },
	{ .name	    = "recvfrom",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmmsg",   .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "recvmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "removexattr", .errmsg = true, },
	{ .name	    = "renameat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "rmdir",    .errmsg = true, },
	{ .name	    = "rt_sigaction", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "sched_getattr",	      .errmsg = true, },
	{ .name	    = "sched_setattr",	      .errmsg = true, },
	{ .name	    = "sched_setscheduler",   .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
	{ .name	    = "seccomp", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
	{ .name	    = "sendmmsg",    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendmsg",    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "sendto",	    .errmsg = true,
	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
	{ .name	    = "setpgid",    .errmsg = true, },
	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
	{ .name	    = "setxattr",   .errmsg = true, },
	{ .name	    = "shutdown",   .errmsg = true, },
	{ .name	    = "socket",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "socketpair", .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
			     [1] = SCA_SK_TYPE, /* type */ },
	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
	{ .name	    = "statfs",	    .errmsg = true, },
	{ .name	    = "statx",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
			     [2] = SCA_STATX_FLAGS, /* flags */
			     [3] = SCA_STATX_MASK, /* mask */ }, },
	{ .name	    = "swapoff",    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "swapon",	    .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
	{ .name	    = "symlinkat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "tgkill",	    .errmsg = true,
	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "tkill",	    .errmsg = true,
	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
	{ .name	    = "truncate",   .errmsg = true, },
	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
	{ .name	    = "unlinkat",   .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
	{ .name	    = "utime",  .errmsg = true, },
	{ .name	    = "utimensat",  .errmsg = true,
	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
	{ .name	    = "utimes",  .errmsg = true, },
	{ .name	    = "vmsplice",  .errmsg = true, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
	{ .name	    = "write",	    .errmsg = true, },
	{ .name	    = "writev",	    .errmsg = true, },
};
806 
807 static int syscall_fmt__cmp(const void *name, const void *fmtp)
808 {
809 	const struct syscall_fmt *fmt = fmtp;
810 	return strcmp(name, fmt->name);
811 }
812 
813 static struct syscall_fmt *syscall_fmt__find(const char *name)
814 {
815 	const int nmemb = ARRAY_SIZE(syscall_fmts);
816 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
817 }
818 
/*
 * Everything needed to pretty-print one syscall: its tracepoint format,
 * argument fields and the formatting hints from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* tracepoint argument field list */
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably exit-style syscalls that never return — confirm where set */
	struct syscall_fmt  *fmt;	/* formatting hints; may be NULL */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;	/* per-arg opaque parm for arg_scnprintf */
};
829 
830 /*
831  * We need to have this 'calculated' boolean because in some cases we really
832  * don't know what is the duration of a syscall, for instance, when we start
833  * a session and some threads are waiting for a syscall to finish, say 'poll',
834  * in which case all we can do is to print "( ? ) for duration and for the
835  * start timestamp.
836  */
837 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
838 {
839 	double duration = (double)t / NSEC_PER_MSEC;
840 	size_t printed = fprintf(fp, "(");
841 
842 	if (!calculated)
843 		printed += fprintf(fp, "     ?   ");
844 	else if (duration >= 1.0)
845 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
846 	else if (duration >= 0.01)
847 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
848 	else
849 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
850 	return printed + fprintf(fp, "): ");
851 }
852 
/**
 * Per-thread tracing state, hung off thread->priv.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter seen, sys_exit not yet */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counters */
	char		  *entry_str;	/* formatted enter line, completed at exit */
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd with a cached path; -1 when empty */
		char	  **table;	/* fd -> pathname, grown on demand */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id statistics */
};
879 
880 static struct thread_trace *thread_trace__new(void)
881 {
882 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
883 
884 	if (ttrace)
885 		ttrace->paths.max = -1;
886 
887 	ttrace->syscall_stats = intlist__new(NULL);
888 
889 	return ttrace;
890 }
891 
892 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
893 {
894 	struct thread_trace *ttrace;
895 
896 	if (thread == NULL)
897 		goto fail;
898 
899 	if (thread__priv(thread) == NULL)
900 		thread__set_priv(thread, thread_trace__new());
901 
902 	if (thread__priv(thread) == NULL)
903 		goto fail;
904 
905 	ttrace = thread__priv(thread);
906 	++ttrace->nr_events;
907 
908 	return ttrace;
909 fail:
910 	color_fprintf(fp, PERF_COLOR_RED,
911 		      "WARNING: not enough memory, dropping samples!\n");
912 	return NULL;
913 }
914 
915 #define TRACE_PFMAJ		(1 << 0)
916 #define TRACE_PFMIN		(1 << 1)
917 
918 static const size_t trace__entry_str_size = 2048;
919 
920 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
921 {
922 	struct thread_trace *ttrace = thread__priv(thread);
923 
924 	if (fd > ttrace->paths.max) {
925 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
926 
927 		if (npath == NULL)
928 			return -1;
929 
930 		if (ttrace->paths.max != -1) {
931 			memset(npath + ttrace->paths.max + 1, 0,
932 			       (fd - ttrace->paths.max) * sizeof(char *));
933 		} else {
934 			memset(npath, 0, (fd + 1) * sizeof(char *));
935 		}
936 
937 		ttrace->paths.table = npath;
938 		ttrace->paths.max   = fd;
939 	}
940 
941 	ttrace->paths.table[fd] = strdup(pathname);
942 
943 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
944 }
945 
/*
 * Resolve 'fd' to its pathname by reading the /proc/<pid>/fd/<fd> symlink
 * (/proc/<pid>/task/<tid>/fd/<fd> for secondary threads) and cache it via
 * trace__set_fd_pathname().  Returns 0 on success, -1 otherwise.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* For a symlink, st_size is the target length; bail if it won't fit. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* The link may have changed since lstat(): recheck the length. */
	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL terminate. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
971 
972 static const char *thread__fd_path(struct thread *thread, int fd,
973 				   struct trace *trace)
974 {
975 	struct thread_trace *ttrace = thread__priv(thread);
976 
977 	if (ttrace == NULL)
978 		return NULL;
979 
980 	if (fd < 0)
981 		return NULL;
982 
983 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
984 		if (!trace->live)
985 			return NULL;
986 		++trace->stats.proc_getname;
987 		if (thread__read_fd_path(thread, fd))
988 			return NULL;
989 	}
990 
991 	return ttrace->paths.table[fd];
992 }
993 
994 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
995 					struct syscall_arg *arg)
996 {
997 	int fd = arg->val;
998 	size_t printed = scnprintf(bf, size, "%d", fd);
999 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1000 
1001 	if (path)
1002 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1003 
1004 	return printed;
1005 }
1006 
1007 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1008 					      struct syscall_arg *arg)
1009 {
1010 	int fd = arg->val;
1011 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1012 	struct thread_trace *ttrace = thread__priv(arg->thread);
1013 
1014 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1015 		zfree(&ttrace->paths.table[fd]);
1016 
1017 	return printed;
1018 }
1019 
/*
 * Record which pointer argument we're awaiting a vfs_getname event for, and
 * where in ttrace->entry_str its resolved string must be spliced in.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1028 
1029 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1030 					      struct syscall_arg *arg)
1031 {
1032 	unsigned long ptr = arg->val;
1033 
1034 	if (!arg->trace->vfs_getname)
1035 		return scnprintf(bf, size, "%#x", ptr);
1036 
1037 	thread__set_filename_pos(arg->thread, bf, ptr);
1038 	return 0;
1039 }
1040 
/*
 * true when the syscall lasted less than the --duration threshold: 't' is
 * in nanoseconds, duration_filter in milliseconds.
 */
static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}
1045 
/* Print the timestamp relative to trace->base_time, in milliseconds. */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}
1052 
1053 /*
1054  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1055  * using ttrace->entry_time for a thread that receives a sys_exit without
1056  * first having received a sys_enter ("poll" issued before tracing session
1057  * starts, lost sys_enter exit due to ring buffer overflow).
1058  */
1059 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1060 {
1061 	if (tstamp > 0)
1062 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1063 
1064 	return fprintf(fp, "         ? ");
1065 }
1066 
/*
 * Session termination flags, written from sig_handler().  A plain bool is
 * not guaranteed to be async-signal-safe; C11 (5.1.2.3) and CERT SIG31-C
 * require objects touched from a signal handler to be volatile
 * sig_atomic_t.  Reads/writes elsewhere keep working unchanged.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	interrupted = sig == SIGINT;	/* SIGINT aborts, others wind down */
}
1075 
/*
 * Print the common line prefix: timestamp, duration and — when tracing
 * more than one thread — the comm/tid of the thread doing the syscall.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}
1090 
1091 static int trace__process_event(struct trace *trace, struct machine *machine,
1092 				union perf_event *event, struct perf_sample *sample)
1093 {
1094 	int ret = 0;
1095 
1096 	switch (event->header.type) {
1097 	case PERF_RECORD_LOST:
1098 		color_fprintf(trace->output, PERF_COLOR_RED,
1099 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1100 		ret = machine__process_lost_event(machine, event, sample);
1101 		break;
1102 	default:
1103 		ret = machine__process_event(machine, event, sample);
1104 		break;
1105 	}
1106 
1107 	return ret;
1108 }
1109 
1110 static int trace__tool_process(struct perf_tool *tool,
1111 			       union perf_event *event,
1112 			       struct perf_sample *sample,
1113 			       struct machine *machine)
1114 {
1115 	struct trace *trace = container_of(tool, struct trace, tool);
1116 	return trace__process_event(trace, machine, event, sample);
1117 }
1118 
/*
 * Wrapper around machine__resolve_kernel_addr() that warns once and then
 * gives up when kptr_restrict prevents resolving kernel addresses.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Already warned: stay quiet, keep returning NULL. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1136 
/*
 * Initialize symbol resolution, create the host machine object and
 * synthesize events for already-running threads so their maps can be
 * resolved.  Returns 0 on success, a negative error code otherwise.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1159 
/*
 * Pick a pretty-printer for each syscall argument: an explicit entry in the
 * syscall_fmts table wins, otherwise heuristics on the tracepoint field
 * type/name select filename, hex pointer, pid, mode or fd formatters.
 * Returns 0 on success, -1 if the formatter array can't be allocated.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* Hand-written formatter takes precedence over heuristics. */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1205 
/*
 * Lazily fill trace->syscalls.table[id]: look up the syscall's name, grow
 * the table if needed, find its sys_enter tracepoint format (also trying
 * the alias from the fmts table) and set up per-argument formatters.
 * Returns 0 on success, -1 on any failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero the entries we just grew into ('.max == -1': empty). */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls only have a tracepoint under an alias, e.g. "newstat". */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * Check and discard the first field when it is '__syscall_nr' or
	 * 'nr', i.e. the syscall number, which is needless here.  Older
	 * kernels don't have that field at all, hence the check.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1264 
/*
 * Translate the -e syscall-name list (trace->ev_qualifier) into syscall
 * ids, stored in trace->ev_qualifier_ids.  All invalid names are reported
 * together before failing.  Returns 0 on success, -EINVAL on allocation
 * failure or unknown names.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/*
			 * First bad name opens the error message; subsequent
			 * ones are appended, comma separated.
			 */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1310 
1311 /*
1312  * args is to be interpreted as a series of longs but we need to handle
1313  * 8-byte unaligned accesses. args points to raw_data within the event
1314  * and raw_data is guaranteed to be 8-byte unaligned because it is
1315  * preceded by raw_size which is a u32. So we need to copy args to a temp
1316  * variable to read it. Most notably this avoids extended load instructions
1317  * on unaligned addresses
1318  */
1319 
/*
 * Format all of a syscall's arguments into 'bf', using the per-argument
 * formatters set up by syscall__set_arg_fmts().  Zero-valued args with no
 * strarray string are suppressed.  When the tracepoint format couldn't be
 * read, fall back to dumping the six raw longs.  Returns chars printed.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip args a formatter already consumed (arg.mask). */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1392 
1393 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1394 				  union perf_event *event,
1395 				  struct perf_sample *sample);
1396 
/*
 * Return the struct syscall for 'id', reading its tracepoint info on first
 * use.  Returns NULL (with a diagnostic when verbose) for invalid ids or
 * when the syscall information can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First sight of this id: read its tracepoint info now. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Recheck: trace__read_syscall_info() may have partially failed. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1439 
/*
 * Fold the just-finished syscall's duration into the per-thread,
 * per-syscall stats used by --summary, allocating the per-id stats node on
 * first use.  Allocation failures silently drop the sample.
 */
static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	/* Missing or bogus sys_enter timestamp: record a zero duration. */
	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}
1465 
/*
 * Another event arrived before the sys_exit for the buffered sys_enter:
 * flush the pending entry line with a ") ..." continuation marker.
 * Returns the number of characters printed (0 if nothing was pending).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1488 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str and either print it right away (exit/exit_group never get a
 * sys_exit) or hold it until the matching sys_exit arrives.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the line buffer on a thread's first syscall. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit/exit_group never return: print the entry now. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1547 
1548 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1549 				    struct perf_sample *sample,
1550 				    struct callchain_cursor *cursor)
1551 {
1552 	struct addr_location al;
1553 
1554 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1555 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1556 		return -1;
1557 
1558 	return 0;
1559 }
1560 
/* Print the callchain gathered in callchain_cursor below the event line. */
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
				        EVSEL__PRINT_DSO |
				        EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
1570 
/*
 * raw_syscalls:sys_exit handler: compute the syscall's duration, print the
 * buffered entry (or a "continued" marker when the entry was already
 * flushed) and pretty-print the return value according to sc->fmt.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open with a vfs_getname'd path: cache fd -> pathname. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0: the sys_enter was never seen (see tstamp helper). */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Chains shallower than --min-stack are filtered out. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry already flushed (or lost): mark the continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		/* Negative return: decode it as an errno name + message. */
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid: show the thread's comm if known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1670 
/*
 * vfs_getname tracepoint/kprobe handler: stash the resolved filename for
 * trace__sys_exit() and, when a sys_enter line is pending, splice the name
 * into entry_str at the position recorded by thread__set_filename_pos(),
 * truncating from the left when it doesn't fit.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Keep a copy for trace__sys_exit() to pair with the open()'s fd. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No sys_enter argument is waiting for this name. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Not enough room: keep the tail of the path, it's more telling. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1731 
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU time per thread and
 * for the whole session, for the --summary output.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/* No per-thread state available: just dump the sample's fields. */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1761 
1762 static void bpf_output__printer(enum binary_printer_ops op,
1763 				unsigned int val, void *extra)
1764 {
1765 	FILE *output = extra;
1766 	unsigned char ch = (unsigned char)val;
1767 
1768 	switch (op) {
1769 	case BINARY_PRINT_CHAR_DATA:
1770 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1771 		break;
1772 	case BINARY_PRINT_DATA_BEGIN:
1773 	case BINARY_PRINT_LINE_BEGIN:
1774 	case BINARY_PRINT_ADDR:
1775 	case BINARY_PRINT_NUM_DATA:
1776 	case BINARY_PRINT_NUM_PAD:
1777 	case BINARY_PRINT_SEP:
1778 	case BINARY_PRINT_CHAR_PAD:
1779 	case BINARY_PRINT_LINE_END:
1780 	case BINARY_PRINT_DATA_END:
1781 	default:
1782 		break;
1783 	}
1784 }
1785 
/* Dump a bpf-output event's raw payload as printable characters. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1792 
/*
 * Handler for the other --event events: print timestamp, the event's
 * fields (or its raw bpf-output payload) and an optional callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Chains shallower than --min-stack are filtered out. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1833 
/*
 * Print where a sample hit: "dso@symbol+offset" when resolved, falling
 * back to the map-relative or raw sample address.  print_dso/print_sym
 * force the respective parts even when not in verbose mode.
 */
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose > 0 || print_dso) && al->map)
		fprintf(f, "%s@", al->map->dso->long_name);

	if ((verbose > 0 || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}
1850 
/*
 * Page fault software event handler: count major/minor faults per thread
 * and print the faulting IP and target address, resolved to symbols where
 * possible.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata, 'x'ecutable or '?' unknown map */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Chains shallower than --min-stack are filtered out. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction... */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* ... and the data address that was being accessed. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not in a data map: try executable maps, else unknown. */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1926 
1927 static void trace__set_base_time(struct trace *trace,
1928 				 struct perf_evsel *evsel,
1929 				 struct perf_sample *sample)
1930 {
1931 	/*
1932 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1933 	 * and don't use sample->time unconditionally, we may end up having
1934 	 * some other event in the future without PERF_SAMPLE_TIME for good
1935 	 * reason, i.e. we may not be interested in its timestamps, just in
1936 	 * it taking place, picking some piece of information when it
1937 	 * appears in our event stream (vfs_getname comes to mind).
1938 	 */
1939 	if (trace->base_time == 0 && !trace->full_time &&
1940 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1941 		trace->base_time = sample->time;
1942 }
1943 
/*
 * perf_tool->sample callback used in replay ('perf trace -i') mode:
 * dispatches each sample to the handler installed on its evsel
 * (sys_enter/sys_exit/pgfault/vfs_getname), skipping filtered threads.
 * Always returns 0 so the session keeps processing.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	struct thread *thread;
	int err = 0;

	tracepoint_handler handler = evsel->handler;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	if (thread && thread__is_filtered(thread))
		goto out;

	/* First relevant sample establishes the relative timestamp base */
	trace__set_base_time(trace, evsel, sample);

	if (handler) {
		++trace->nr_events;
		handler(trace, evsel, event, sample);
	}
out:
	thread__put(thread);
	return err;
}
1970 
1971 static int trace__record(struct trace *trace, int argc, const char **argv)
1972 {
1973 	unsigned int rec_argc, i, j;
1974 	const char **rec_argv;
1975 	const char * const record_args[] = {
1976 		"record",
1977 		"-R",
1978 		"-m", "1024",
1979 		"-c", "1",
1980 	};
1981 
1982 	const char * const sc_args[] = { "-e", };
1983 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1984 	const char * const majpf_args[] = { "-e", "major-faults" };
1985 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1986 	const char * const minpf_args[] = { "-e", "minor-faults" };
1987 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1988 
1989 	/* +1 is for the event string below */
1990 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1991 		majpf_args_nr + minpf_args_nr + argc;
1992 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1993 
1994 	if (rec_argv == NULL)
1995 		return -ENOMEM;
1996 
1997 	j = 0;
1998 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1999 		rec_argv[j++] = record_args[i];
2000 
2001 	if (trace->trace_syscalls) {
2002 		for (i = 0; i < sc_args_nr; i++)
2003 			rec_argv[j++] = sc_args[i];
2004 
2005 		/* event string may be different for older kernels - e.g., RHEL6 */
2006 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2007 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2008 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2009 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2010 		else {
2011 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2012 			return -1;
2013 		}
2014 	}
2015 
2016 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2017 		for (i = 0; i < majpf_args_nr; i++)
2018 			rec_argv[j++] = majpf_args[i];
2019 
2020 	if (trace->trace_pgfaults & TRACE_PFMIN)
2021 		for (i = 0; i < minpf_args_nr; i++)
2022 			rec_argv[j++] = minpf_args[i];
2023 
2024 	for (i = 0; i < (unsigned int)argc; i++)
2025 		rec_argv[j++] = argv[i];
2026 
2027 	return cmd_record(j, rec_argv);
2028 }
2029 
2030 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2031 
2032 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2033 {
2034 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2035 
2036 	if (IS_ERR(evsel))
2037 		return false;
2038 
2039 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2040 		perf_evsel__delete(evsel);
2041 		return false;
2042 	}
2043 
2044 	evsel->handler = trace__vfs_getname;
2045 	perf_evlist__add(evlist, evsel);
2046 	return true;
2047 }
2048 
2049 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2050 {
2051 	struct perf_evsel *evsel;
2052 	struct perf_event_attr attr = {
2053 		.type = PERF_TYPE_SOFTWARE,
2054 		.mmap_data = 1,
2055 	};
2056 
2057 	attr.config = config;
2058 	attr.sample_period = 1;
2059 
2060 	event_attr_init(&attr);
2061 
2062 	evsel = perf_evsel__new(&attr);
2063 	if (evsel)
2064 		evsel->handler = trace__pgfault;
2065 
2066 	return evsel;
2067 }
2068 
/*
 * Demultiplex one event read from the ring buffer in live mode:
 * non-sample records (mmap, comm, fork, exit, ...) feed the machine
 * state, samples are mapped back to their evsel by id and dispatched to
 * the handler installed on it (sys_enter/sys_exit/pgfault/...).
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* A tracepoint sample must carry its raw payload to be decoded */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2097 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, resolve
 * the fields the strace-like output needs (args pointer, return value)
 * and add both to the evlist.
 *
 * Return: 0 on success, -1 on failure (tracefs not available, missing
 * tracepoint fields, etc), with both evsels cleaned up.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2143 
/*
 * Build a tracepoint filter expression matching (or, with a leading '!',
 * excluding) the syscall ids collected in trace->ev_qualifier_ids and
 * append it to both the sys_enter and sys_exit evsels, so the kernel
 * filters at the source.
 *
 * Return: 0 on success, -1 (with errno set on ENOMEM) on failure.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append returns 0 on success; only then filter sys_exit too */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2168 
/*
 * The live 'perf trace' main loop: set up the requested evsels
 * (raw_syscalls tracepoints, vfs_getname, pagefaults, sched_stat_runtime),
 * open and mmap them, optionally fork the workload, then consume events
 * from the ring buffers until done/interrupted, finally printing the
 * summaries the user asked for.
 *
 * Return: 0 on success, negative on error (message already printed to
 * trace->output).
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* remaining argv is the workload */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Print tids when more than one thread may show up in the stream */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				/* Workload exited: stop producing, drain what's left */
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing consumed this pass: poll (with a bounded wait when done) */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
{
	/*
	 * Unreachable fall-through: this brace block only exists to give
	 * the out_error_* labels below a scope with an errbuf to format
	 * strerror-style messages into.
	 */
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2432 
2433 static int trace__replay(struct trace *trace)
2434 {
2435 	const struct perf_evsel_str_handler handlers[] = {
2436 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2437 	};
2438 	struct perf_data_file file = {
2439 		.path  = input_name,
2440 		.mode  = PERF_DATA_MODE_READ,
2441 		.force = trace->force,
2442 	};
2443 	struct perf_session *session;
2444 	struct perf_evsel *evsel;
2445 	int err = -1;
2446 
2447 	trace->tool.sample	  = trace__process_sample;
2448 	trace->tool.mmap	  = perf_event__process_mmap;
2449 	trace->tool.mmap2	  = perf_event__process_mmap2;
2450 	trace->tool.comm	  = perf_event__process_comm;
2451 	trace->tool.exit	  = perf_event__process_exit;
2452 	trace->tool.fork	  = perf_event__process_fork;
2453 	trace->tool.attr	  = perf_event__process_attr;
2454 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2455 	trace->tool.build_id	  = perf_event__process_build_id;
2456 	trace->tool.namespaces	  = perf_event__process_namespaces;
2457 
2458 	trace->tool.ordered_events = true;
2459 	trace->tool.ordering_requires_timestamps = true;
2460 
2461 	/* add tid to output */
2462 	trace->multiple_threads = true;
2463 
2464 	session = perf_session__new(&file, false, &trace->tool);
2465 	if (session == NULL)
2466 		return -1;
2467 
2468 	if (trace->opts.target.pid)
2469 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2470 
2471 	if (trace->opts.target.tid)
2472 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2473 
2474 	if (symbol__init(&session->header.env) < 0)
2475 		goto out;
2476 
2477 	trace->host = &session->machines.host;
2478 
2479 	err = perf_session__set_tracepoints_handlers(session, handlers);
2480 	if (err)
2481 		goto out;
2482 
2483 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2484 						     "raw_syscalls:sys_enter");
2485 	/* older kernels have syscalls tp versus raw_syscalls */
2486 	if (evsel == NULL)
2487 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2488 							     "syscalls:sys_enter");
2489 
2490 	if (evsel &&
2491 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2492 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2493 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2494 		goto out;
2495 	}
2496 
2497 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2498 						     "raw_syscalls:sys_exit");
2499 	if (evsel == NULL)
2500 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2501 							     "syscalls:sys_exit");
2502 	if (evsel &&
2503 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2504 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2505 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2506 		goto out;
2507 	}
2508 
2509 	evlist__for_each_entry(session->evlist, evsel) {
2510 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2511 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2512 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2513 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2514 			evsel->handler = trace__pgfault;
2515 	}
2516 
2517 	setup_pager();
2518 
2519 	err = perf_session__process_events(session);
2520 	if (err)
2521 		pr_err("Failed to process events, error %d", err);
2522 
2523 	else if (trace->summary)
2524 		trace__fprintf_thread_summary(trace, trace->output);
2525 
2526 out:
2527 	perf_session__delete(session);
2528 
2529 	return err;
2530 }
2531 
static size_t trace__fprintf_threads_header(FILE *fp)
{
	/* Banner preceding the per-thread summary table. */
	return fprintf(fp, "\n Summary of events:\n\n");
}
2540 
/*
 * Re-sort the intlist-backed rbtree of per-syscall stats by total time
 * spent (msecs, descending) for the summary table.  The brace block is
 * the DEFINE_RESORT_RB() per-node initializer.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = nr_samples * average, converted from nsecs to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2554 
/*
 * Print one thread's syscall statistics table ('perf trace -s/-S'),
 * sorted by total time spent in each syscall, descending.  Returns the
 * number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats accumulate nsecs, the table shows msecs */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative stddev, as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2597 
/*
 * Print one thread's entry in the summary: comm, tid, event count and
 * share of the session, pagefault counters, optional runtime and then
 * its per-syscall stats table.  Returns the number of chars printed.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events seen in the session */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2625 
2626 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2627 {
2628 	return ttrace ? ttrace->nr_events : 0;
2629 }
2630 
/*
 * Re-sort the machine's thread rbtree by ascending number of trace
 * events for the summary.  The brace block is the DEFINE_RESORT_RB()
 * per-node initializer.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2637 
/*
 * Print the end-of-session summary ('-s'/'-S'): one entry per thread,
 * ordered by number of events.  Returns the number of chars printed,
 * or 0 when the re-sorted tree could not be allocated.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2656 
2657 static int trace__set_duration(const struct option *opt, const char *str,
2658 			       int unset __maybe_unused)
2659 {
2660 	struct trace *trace = opt->value;
2661 
2662 	trace->duration_filter = atof(str);
2663 	return 0;
2664 }
2665 
2666 static int trace__set_filter_pids(const struct option *opt, const char *str,
2667 				  int unset __maybe_unused)
2668 {
2669 	int ret = -1;
2670 	size_t i;
2671 	struct trace *trace = opt->value;
2672 	/*
2673 	 * FIXME: introduce a intarray class, plain parse csv and create a
2674 	 * { int nr, int entries[] } struct...
2675 	 */
2676 	struct intlist *list = intlist__new(str);
2677 
2678 	if (list == NULL)
2679 		return -1;
2680 
2681 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2682 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2683 
2684 	if (trace->filter_pids.entries == NULL)
2685 		goto out;
2686 
2687 	trace->filter_pids.entries[0] = getpid();
2688 
2689 	for (i = 1; i < trace->filter_pids.nr; ++i)
2690 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2691 
2692 	intlist__delete(list);
2693 	ret = 0;
2694 out:
2695 	return ret;
2696 }
2697 
2698 static int trace__open_output(struct trace *trace, const char *filename)
2699 {
2700 	struct stat st;
2701 
2702 	if (!stat(filename, &st) && st.st_size) {
2703 		char oldname[PATH_MAX];
2704 
2705 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2706 		unlink(oldname);
2707 		rename(filename, oldname);
2708 	}
2709 
2710 	trace->output = fopen(filename, "w");
2711 
2712 	return trace->output == NULL ? -errno : 0;
2713 }
2714 
2715 static int parse_pagefaults(const struct option *opt, const char *str,
2716 			    int unset __maybe_unused)
2717 {
2718 	int *trace_pgfaults = opt->value;
2719 
2720 	if (strcmp(str, "all") == 0)
2721 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2722 	else if (strcmp(str, "maj") == 0)
2723 		*trace_pgfaults |= TRACE_PFMAJ;
2724 	else if (strcmp(str, "min") == 0)
2725 		*trace_pgfaults |= TRACE_PFMIN;
2726 	else
2727 		return -1;
2728 
2729 	return 0;
2730 }
2731 
/* Install the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2739 
2740 /*
2741  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2742  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2743  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2744  *
2745  * It'd be better to introduce a parse_options() variant that would return a
2746  * list with the terms it didn't match to an event...
2747  */
2748 static int trace__parse_events_option(const struct option *opt, const char *str,
2749 				      int unset __maybe_unused)
2750 {
2751 	struct trace *trace = (struct trace *)opt->value;
2752 	const char *s = str;
2753 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2754 	int len = strlen(str), err = -1, list;
2755 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2756 	char group_name[PATH_MAX];
2757 
2758 	if (strace_groups_dir == NULL)
2759 		return -1;
2760 
2761 	if (*s == '!') {
2762 		++s;
2763 		trace->not_ev_qualifier = true;
2764 	}
2765 
2766 	while (1) {
2767 		if ((sep = strchr(s, ',')) != NULL)
2768 			*sep = '\0';
2769 
2770 		list = 0;
2771 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
2772 			list = 1;
2773 		} else {
2774 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2775 			if (access(group_name, R_OK) == 0)
2776 				list = 1;
2777 		}
2778 
2779 		if (lists[list]) {
2780 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2781 		} else {
2782 			lists[list] = malloc(len);
2783 			if (lists[list] == NULL)
2784 				goto out;
2785 			strcpy(lists[list], s);
2786 		}
2787 
2788 		if (!sep)
2789 			break;
2790 
2791 		*sep = ',';
2792 		s = sep + 1;
2793 	}
2794 
2795 	if (lists[1] != NULL) {
2796 		struct strlist_config slist_config = {
2797 			.dirname = strace_groups_dir,
2798 		};
2799 
2800 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2801 		if (trace->ev_qualifier == NULL) {
2802 			fputs("Not enough memory to parse event qualifier", trace->output);
2803 			goto out;
2804 		}
2805 
2806 		if (trace__validate_ev_qualifier(trace))
2807 			goto out;
2808 	}
2809 
2810 	err = 0;
2811 
2812 	if (lists[0]) {
2813 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2814 					       "event selector. use 'perf list' to list available events",
2815 					       parse_events_option);
2816 		err = parse_events_option(&o, lists[0], 0);
2817 	}
2818 out:
2819 	if (sep)
2820 		*sep = ',';
2821 
2822 	return err;
2823 }
2824 
2825 int cmd_trace(int argc, const char **argv)
2826 {
2827 	const char *trace_usage[] = {
2828 		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Tool defaults. The UINT_MAX/ULLONG_MAX/-1 values are sentinels
	 * meaning "not set by the user"; they are checked after option
	 * parsing to decide whether to apply computed defaults (see the
	 * mmap_pages/max_stack handling below).
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1, /* no syscall id seen yet */
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX, /* sentinel: resolved below */
			.proc_map_timeout  = 500, /* ms, overridable via --proc-map-timeout */
		},
		.output = stderr, /* until -o/--output redirects it below */
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX, /* sentinel: resolved below */
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	/* --expr shares the -e/--event parser; kept as an alias */
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	/* __maybe_unused: only read inside the HAVE_DWARF_UNWIND_SUPPORT block below */
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ]; /* scratch buffer for strerror-style messages */

	/* Print a backtrace if the tool itself crashes. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/* Stop at the first non-option so "perf trace <cmd> <args>" works. */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1; /* default error code for the failure paths below */

	if (trace.trace_pgfaults) {
		/* Page fault events carry the faulting address and need timestamps. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* Sentinel still present => the user did not set the option. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		/*
		 * Replaying a perf.data file: use the compile-time maximum;
		 * live tracing: honour the kernel.perf_event_max_stack sysctl.
		 */
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* --min-stack/--max-stack imply callchains: default to DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains need larger ring buffers; bump to 4x the
		 * mlock'able page budget, but only when running as root —
		 * NOTE(review): presumably because root is not subject to
		 * the perf_event_mlock_kb limit; confirm.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* Any -e/--event tracepoints get the generic event handler. */
	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* "perf trace record ..." is handled entirely by trace__record(). */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		/* NOTE(review): bypasses the out: label; equivalent here since err == -1 */
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the "open" syscall id — NOTE(review): presumably consulted when
	 * beautifying open-style args elsewhere in this file; confirm. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu/uid target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace); /* -i: process a recorded perf.data */
	else
		err = trace__run(&trace, argc, argv); /* live tracing */

out_close:
	/* Only close trace.output when -o opened it; never fclose(stderr). */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3032