xref: /linux/tools/perf/builtin-trace.c (revision 0d08df6c493898e679d9c517e77ea95c063d40ec)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40 
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <sys/ptrace.h>
47 #include <linux/random.h>
48 #include <linux/stringify.h>
49 
/* Fallback for C libraries whose headers do not provide O_CLOEXEC. */
#if !defined(O_CLOEXEC)
#define O_CLOEXEC 02000000
#endif
53 
54 struct trace {
55 	struct perf_tool	tool;
56 	struct syscalltbl	*sctbl;
57 	struct {
58 		int		max;
59 		struct syscall  *table;
60 		struct {
61 			struct perf_evsel *sys_enter,
62 					  *sys_exit;
63 		}		events;
64 	} syscalls;
65 	struct record_opts	opts;
66 	struct perf_evlist	*evlist;
67 	struct machine		*host;
68 	struct thread		*current;
69 	u64			base_time;
70 	FILE			*output;
71 	unsigned long		nr_events;
72 	struct strlist		*ev_qualifier;
73 	struct {
74 		size_t		nr;
75 		int		*entries;
76 	}			ev_qualifier_ids;
77 	struct intlist		*tid_list;
78 	struct intlist		*pid_list;
79 	struct {
80 		size_t		nr;
81 		pid_t		*entries;
82 	}			filter_pids;
83 	double			duration_filter;
84 	double			runtime_ms;
85 	struct {
86 		u64		vfs_getname,
87 				proc_getname;
88 	} stats;
89 	unsigned int		max_stack;
90 	unsigned int		min_stack;
91 	bool			not_ev_qualifier;
92 	bool			live;
93 	bool			full_time;
94 	bool			sched;
95 	bool			multiple_threads;
96 	bool			summary;
97 	bool			summary_only;
98 	bool			show_comm;
99 	bool			show_tool_stats;
100 	bool			trace_syscalls;
101 	bool			kernel_syscallchains;
102 	bool			force;
103 	bool			vfs_getname;
104 	int			trace_pgfaults;
105 	int			open_id;
106 };
107 
108 struct tp_field {
109 	int offset;
110 	union {
111 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
112 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
113 	};
114 };
115 
116 #define TP_UINT_FIELD(bits) \
117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
118 { \
119 	u##bits value; \
120 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
121 	return value;  \
122 }
123 
124 TP_UINT_FIELD(8);
125 TP_UINT_FIELD(16);
126 TP_UINT_FIELD(32);
127 TP_UINT_FIELD(64);
128 
129 #define TP_UINT_FIELD__SWAPPED(bits) \
130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
131 { \
132 	u##bits value; \
133 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
134 	return bswap_##bits(value);\
135 }
136 
137 TP_UINT_FIELD__SWAPPED(16);
138 TP_UINT_FIELD__SWAPPED(32);
139 TP_UINT_FIELD__SWAPPED(64);
140 
141 static int tp_field__init_uint(struct tp_field *field,
142 			       struct format_field *format_field,
143 			       bool needs_swap)
144 {
145 	field->offset = format_field->offset;
146 
147 	switch (format_field->size) {
148 	case 1:
149 		field->integer = tp_field__u8;
150 		break;
151 	case 2:
152 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153 		break;
154 	case 4:
155 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156 		break;
157 	case 8:
158 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159 		break;
160 	default:
161 		return -1;
162 	}
163 
164 	return 0;
165 }
166 
167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
168 {
169 	return sample->raw_data + field->offset;
170 }
171 
172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174 	field->offset = format_field->offset;
175 	field->pointer = tp_field__ptr;
176 	return 0;
177 }
178 
179 struct syscall_tp {
180 	struct tp_field id;
181 	union {
182 		struct tp_field args, ret;
183 	};
184 };
185 
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 					  struct tp_field *field,
188 					  const char *name)
189 {
190 	struct format_field *format_field = perf_evsel__field(evsel, name);
191 
192 	if (format_field == NULL)
193 		return -1;
194 
195 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197 
/*
 * Initialize member 'name' of the struct syscall_tp stored in evsel->priv as
 * an unsigned integer accessor for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ \
		struct syscall_tp *sc = evsel->priv; \
		perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); \
	})
201 
/*
 * Look up tracepoint field @name in @evsel and initialize @field as a
 * pointer accessor for it.
 *
 * Returns 0 on success, -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}
213 
/*
 * Initialize member 'name' of the struct syscall_tp stored in evsel->priv as
 * a pointer accessor for the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ \
		struct syscall_tp *sc = evsel->priv; \
		perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); \
	})
217 
218 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
219 {
220 	zfree(&evsel->priv);
221 	perf_evsel__delete(evsel);
222 }
223 
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226 	evsel->priv = malloc(sizeof(struct syscall_tp));
227 	if (evsel->priv != NULL) {
228 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 			goto out_delete;
230 
231 		evsel->handler = handler;
232 		return 0;
233 	}
234 
235 	return -ENOMEM;
236 
237 out_delete:
238 	zfree(&evsel->priv);
239 	return -ENOENT;
240 }
241 
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare
 * it with perf_evsel__init_syscall_tp(), installing @handler.
 *
 * The "raw_syscalls" subsystem is tried first; older kernels (e.g., RHEL6)
 * expose the same tracepoints under "syscalls", which is used as a fallback.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * its setup failed (the evsel is deleted in that case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel;

	evsel = perf_evsel__newtp("raw_syscalls", direction);
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);
	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
262 
/*
 * Read member 'name' of the evsel's struct syscall_tp from 'sample' through
 * the integer reader installed for it.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ \
		struct syscall_tp *fields = evsel->priv; \
		fields->name.integer(&fields->name, sample); \
	})

/*
 * Same as perf_evsel__sc_tp_uint() but goes through the pointer reader and
 * yields a pointer into the sample.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ \
		struct syscall_tp *fields = evsel->priv; \
		fields->name.pointer(&fields->name, sample); \
	})
270 
271 struct syscall_arg {
272 	unsigned long val;
273 	struct thread *thread;
274 	struct trace  *trace;
275 	void	      *parm;
276 	u8	      idx;
277 	u8	      mask;
278 };
279 
/*
 * Table mapping integer values to names: entries[v - offset] is the name of
 * value v, for 0 <= v - offset < nr_entries.
 */
struct strarray {
	int	     offset;
	int	     nr_entries;
	const char **entries;
};
285 
/* Defines strarray__<array>, a struct strarray wrapping 'array' (offset 0). */
#define DEFINE_STRARRAY(array) \
struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}

/* Same, for tables whose first entry names the value 'off' instead of 0. */
#define DEFINE_STRARRAY_OFFSET(array, off) \
struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
296 
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 						const char *intfmt,
299 					        struct syscall_arg *arg)
300 {
301 	struct strarray *sa = arg->parm;
302 	int idx = arg->val - sa->offset;
303 
304 	if (idx < 0 || idx >= sa->nr_entries)
305 		return scnprintf(bf, size, intfmt, arg->val);
306 
307 	return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309 
/*
 * String table formatter: values without a name in the table are printed in
 * decimal.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
317 
318 #if defined(__i386__) || defined(__x86_64__)
319 /*
320  * FIXME: Make this available to all arches as soon as the ioctl beautifier
321  * 	  gets rewritten to support all arches.
322  */
/*
 * String table formatter: values without a name in the table are printed in
 * hexadecimal.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
330 #endif /* defined(__i386__) || defined(__x86_64__) */
331 
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 					struct syscall_arg *arg);
334 
335 #define SCA_FD syscall_arg__scnprintf_fd
336 
337 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
338 					   struct syscall_arg *arg)
339 {
340 	int fd = arg->val;
341 
342 	if (fd == AT_FDCWD)
343 		return scnprintf(bf, size, "CWD");
344 
345 	return syscall_arg__scnprintf_fd(bf, size, arg);
346 }
347 
348 #define SCA_FDAT syscall_arg__scnprintf_fd_at
349 
350 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
351 					      struct syscall_arg *arg);
352 
353 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
354 
355 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
356 					 struct syscall_arg *arg)
357 {
358 	return scnprintf(bf, size, "%#lx", arg->val);
359 }
360 
361 #define SCA_HEX syscall_arg__scnprintf_hex
362 
363 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
364 					 struct syscall_arg *arg)
365 {
366 	return scnprintf(bf, size, "%d", arg->val);
367 }
368 
369 #define SCA_INT syscall_arg__scnprintf_int
370 
371 static const char *bpf_cmd[] = {
372 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
373 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
374 };
375 static DEFINE_STRARRAY(bpf_cmd);
376 
377 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
378 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
379 
380 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
381 static DEFINE_STRARRAY(itimers);
382 
383 static const char *keyctl_options[] = {
384 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
385 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
386 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
387 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
388 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
389 };
390 static DEFINE_STRARRAY(keyctl_options);
391 
392 static const char *whences[] = { "SET", "CUR", "END",
393 #ifdef SEEK_DATA
394 "DATA",
395 #endif
396 #ifdef SEEK_HOLE
397 "HOLE",
398 #endif
399 };
400 static DEFINE_STRARRAY(whences);
401 
402 static const char *fcntl_cmds[] = {
403 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
404 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
405 	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
406 	"F_GETOWNER_UIDS",
407 };
408 static DEFINE_STRARRAY(fcntl_cmds);
409 
410 static const char *rlimit_resources[] = {
411 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
412 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
413 	"RTTIME",
414 };
415 static DEFINE_STRARRAY(rlimit_resources);
416 
417 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
418 static DEFINE_STRARRAY(sighow);
419 
420 static const char *clockid[] = {
421 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
422 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
423 	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
424 };
425 static DEFINE_STRARRAY(clockid);
426 
/*
 * Socket address family names, indexed by AF_* value.
 *
 * The table is positional, so it must not skip any value: "MPLS" (AF_MPLS,
 * 28) sits between "IB" (27) and "CAN" (29). Leaving it out shifts every
 * later family down by one, e.g. AF_CAN sockets would be shown as "TIPC".
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
435 static DEFINE_STRARRAY(socket_families);
436 
437 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
438 						 struct syscall_arg *arg)
439 {
440 	size_t printed = 0;
441 	int mode = arg->val;
442 
443 	if (mode == F_OK) /* 0 */
444 		return scnprintf(bf, size, "F");
445 #define	P_MODE(n) \
446 	if (mode & n##_OK) { \
447 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
448 		mode &= ~n##_OK; \
449 	}
450 
451 	P_MODE(R);
452 	P_MODE(W);
453 	P_MODE(X);
454 #undef P_MODE
455 
456 	if (mode)
457 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
458 
459 	return printed;
460 }
461 
462 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
463 
464 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
465 					      struct syscall_arg *arg);
466 
467 #define SCA_FILENAME syscall_arg__scnprintf_filename
468 
469 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
470 						struct syscall_arg *arg)
471 {
472 	int printed = 0, flags = arg->val;
473 
474 #define	P_FLAG(n) \
475 	if (flags & O_##n) { \
476 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
477 		flags &= ~O_##n; \
478 	}
479 
480 	P_FLAG(CLOEXEC);
481 	P_FLAG(NONBLOCK);
482 #undef P_FLAG
483 
484 	if (flags)
485 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
486 
487 	return printed;
488 }
489 
490 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
491 
492 #if defined(__i386__) || defined(__x86_64__)
493 /*
494  * FIXME: Make this available to all arches.
495  */
496 #define TCGETS		0x5401
497 
/*
 * Names of the tty ioctl commands, indexed by (cmd - 0x5401), 0x5401 being
 * TCGETS, the first entry and the offset the table is registered with.
 *
 * Designated initializers therefore have to be written relative to 0x5401:
 * TIOCSBRK is 0x5427, FIONCLEX is 0x5450 and FIOQSIZE is 0x5460. Using the
 * bare low byte (0x27, 0x50, 0x60) as the index puts these entries, and all
 * the ones following each of them, one slot too far, e.g. 0x5428 (TIOCCBRK)
 * would be decoded as "TIOCSBRK".
 *
 * Slots not listed stay NULL.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
	[0x5427 - 0x5401] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x5450 - 0x5401] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT",
	[0x5460 - 0x5401] = "FIOQSIZE",
};
515 
516 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
517 #endif /* defined(__i386__) || defined(__x86_64__) */
518 
519 #ifndef GRND_NONBLOCK
520 #define GRND_NONBLOCK	0x0001
521 #endif
522 #ifndef GRND_RANDOM
523 #define GRND_RANDOM	0x0002
524 #endif
525 
526 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
527 						   struct syscall_arg *arg)
528 {
529 	int printed = 0, flags = arg->val;
530 
531 #define	P_FLAG(n) \
532 	if (flags & GRND_##n) { \
533 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
534 		flags &= ~GRND_##n; \
535 	}
536 
537 	P_FLAG(RANDOM);
538 	P_FLAG(NONBLOCK);
539 #undef P_FLAG
540 
541 	if (flags)
542 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
543 
544 	return printed;
545 }
546 
547 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
548 
/*
 * Initializer fragment for a struct syscall_fmt entry: argument number 'arg'
 * is formatted through SCA_STRARRAY using strarray__<array>. 'name' is not
 * expanded, it only labels the argument at the point of use.
 */
#define STRARRAY(arg, name, array) \
	.arg_scnprintf	= { [arg] = SCA_STRARRAY, }, \
	.arg_parm	= { [arg] = &strarray__##array, }
552 
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/flock.c"
555 #include "trace/beauty/futex_op.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567 
568 static struct syscall_fmt {
569 	const char *name;
570 	const char *alias;
571 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
572 	void	   *arg_parm[6];
573 	bool	   errmsg;
574 	bool	   errpid;
575 	bool	   timeout;
576 	bool	   hexret;
577 } syscall_fmts[] = {
578 	{ .name	    = "access",	    .errmsg = true,
579 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
580 			     [1] = SCA_ACCMODE,  /* mode */ }, },
581 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
582 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
583 	{ .name	    = "brk",	    .hexret = true,
584 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
585 	{ .name	    = "chdir",	    .errmsg = true,
586 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
587 	{ .name	    = "chmod",	    .errmsg = true,
588 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
589 	{ .name	    = "chroot",	    .errmsg = true,
590 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
591 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 	{ .name	    = "clone",	    .errpid = true, },
593 	{ .name	    = "close",	    .errmsg = true,
594 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 	{ .name	    = "connect",    .errmsg = true, },
596 	{ .name	    = "creat",	    .errmsg = true,
597 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
598 	{ .name	    = "dup",	    .errmsg = true,
599 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
600 	{ .name	    = "dup2",	    .errmsg = true,
601 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
602 	{ .name	    = "dup3",	    .errmsg = true,
603 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
604 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
605 	{ .name	    = "eventfd2",   .errmsg = true,
606 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
607 	{ .name	    = "faccessat",  .errmsg = true,
608 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
609 			     [1] = SCA_FILENAME, /* filename */ }, },
610 	{ .name	    = "fadvise64",  .errmsg = true,
611 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
612 	{ .name	    = "fallocate",  .errmsg = true,
613 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
614 	{ .name	    = "fchdir",	    .errmsg = true,
615 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
616 	{ .name	    = "fchmod",	    .errmsg = true,
617 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
618 	{ .name	    = "fchmodat",   .errmsg = true,
619 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
620 			     [1] = SCA_FILENAME, /* filename */ }, },
621 	{ .name	    = "fchown",	    .errmsg = true,
622 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
623 	{ .name	    = "fchownat",   .errmsg = true,
624 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
625 			     [1] = SCA_FILENAME, /* filename */ }, },
626 	{ .name	    = "fcntl",	    .errmsg = true,
627 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
628 			     [1] = SCA_STRARRAY, /* cmd */ },
629 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
630 	{ .name	    = "fdatasync",  .errmsg = true,
631 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
632 	{ .name	    = "flock",	    .errmsg = true,
633 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
634 			     [1] = SCA_FLOCK, /* cmd */ }, },
635 	{ .name	    = "fsetxattr",  .errmsg = true,
636 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
637 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
638 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
639 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
640 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
641 			     [1] = SCA_FILENAME, /* filename */ }, },
642 	{ .name	    = "fstatfs",    .errmsg = true,
643 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
644 	{ .name	    = "fsync",    .errmsg = true,
645 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
646 	{ .name	    = "ftruncate", .errmsg = true,
647 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
648 	{ .name	    = "futex",	    .errmsg = true,
649 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
650 	{ .name	    = "futimesat", .errmsg = true,
651 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
652 			     [1] = SCA_FILENAME, /* filename */ }, },
653 	{ .name	    = "getdents",   .errmsg = true,
654 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
655 	{ .name	    = "getdents64", .errmsg = true,
656 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
657 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
658 	{ .name	    = "getpid",	    .errpid = true, },
659 	{ .name	    = "getpgid",    .errpid = true, },
660 	{ .name	    = "getppid",    .errpid = true, },
661 	{ .name	    = "getrandom",  .errmsg = true,
662 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
663 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
664 	{ .name	    = "getxattr",    .errmsg = true,
665 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
666 	{ .name	    = "inotify_add_watch",	    .errmsg = true,
667 	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
668 	{ .name	    = "ioctl",	    .errmsg = true,
669 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
670 #if defined(__i386__) || defined(__x86_64__)
671 /*
672  * FIXME: Make this available to all arches.
673  */
674 			     [1] = SCA_STRHEXARRAY, /* cmd */
675 			     [2] = SCA_HEX, /* arg */ },
676 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
677 #else
678 			     [2] = SCA_HEX, /* arg */ }, },
679 #endif
680 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
681 	{ .name	    = "kill",	    .errmsg = true,
682 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
683 	{ .name	    = "lchown",    .errmsg = true,
684 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
685 	{ .name	    = "lgetxattr",  .errmsg = true,
686 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
687 	{ .name	    = "linkat",	    .errmsg = true,
688 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
689 	{ .name	    = "listxattr",  .errmsg = true,
690 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
691 	{ .name	    = "llistxattr", .errmsg = true,
692 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
693 	{ .name	    = "lremovexattr",  .errmsg = true,
694 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
695 	{ .name	    = "lseek",	    .errmsg = true,
696 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
697 			     [2] = SCA_STRARRAY, /* whence */ },
698 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
699 	{ .name	    = "lsetxattr",  .errmsg = true,
700 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
701 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat",
702 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
703 	{ .name	    = "lsxattr",    .errmsg = true,
704 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
705 	{ .name     = "madvise",    .errmsg = true,
706 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
707 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
708 	{ .name	    = "mkdir",    .errmsg = true,
709 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
710 	{ .name	    = "mkdirat",    .errmsg = true,
711 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
712 			     [1] = SCA_FILENAME, /* pathname */ }, },
713 	{ .name	    = "mknod",      .errmsg = true,
714 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
715 	{ .name	    = "mknodat",    .errmsg = true,
716 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
717 			     [1] = SCA_FILENAME, /* filename */ }, },
718 	{ .name	    = "mlock",	    .errmsg = true,
719 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
720 	{ .name	    = "mlockall",   .errmsg = true,
721 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
722 	{ .name	    = "mmap",	    .hexret = true,
723 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
724 			     [2] = SCA_MMAP_PROT, /* prot */
725 			     [3] = SCA_MMAP_FLAGS, /* flags */
726 			     [4] = SCA_FD, 	  /* fd */ }, },
727 	{ .name	    = "mprotect",   .errmsg = true,
728 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
729 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
730 	{ .name	    = "mq_unlink", .errmsg = true,
731 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
732 	{ .name	    = "mremap",	    .hexret = true,
733 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
734 			     [3] = SCA_MREMAP_FLAGS, /* flags */
735 			     [4] = SCA_HEX, /* new_addr */ }, },
736 	{ .name	    = "munlock",    .errmsg = true,
737 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
738 	{ .name	    = "munmap",	    .errmsg = true,
739 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
740 	{ .name	    = "name_to_handle_at", .errmsg = true,
741 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
742 	{ .name	    = "newfstatat", .errmsg = true,
743 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
744 			     [1] = SCA_FILENAME, /* filename */ }, },
745 	{ .name	    = "open",	    .errmsg = true,
746 	  .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
747 			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
748 	{ .name	    = "open_by_handle_at", .errmsg = true,
749 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
750 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
751 	{ .name	    = "openat",	    .errmsg = true,
752 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
753 			     [1] = SCA_FILENAME, /* filename */
754 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
755 	{ .name	    = "perf_event_open", .errmsg = true,
756 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
757 			     [3] = SCA_FD,  /* group_fd */
758 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
759 	{ .name	    = "pipe2",	    .errmsg = true,
760 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
761 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
762 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
763 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
764 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
765 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
766 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
767 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
768 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
769 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
770 	{ .name	    = "pwritev",    .errmsg = true,
771 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
772 	{ .name	    = "read",	    .errmsg = true,
773 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
774 	{ .name	    = "readlink",   .errmsg = true,
775 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
776 	{ .name	    = "readlinkat", .errmsg = true,
777 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
778 			     [1] = SCA_FILENAME, /* pathname */ }, },
779 	{ .name	    = "readv",	    .errmsg = true,
780 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
781 	{ .name	    = "recvfrom",   .errmsg = true,
782 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
783 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
784 	{ .name	    = "recvmmsg",   .errmsg = true,
785 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
786 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
787 	{ .name	    = "recvmsg",    .errmsg = true,
788 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
789 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
790 	{ .name	    = "removexattr", .errmsg = true,
791 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
792 	{ .name	    = "renameat",   .errmsg = true,
793 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
794 	{ .name	    = "rmdir",    .errmsg = true,
795 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
796 	{ .name	    = "rt_sigaction", .errmsg = true,
797 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
798 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
799 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
800 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
801 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
802 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
803 	{ .name	    = "sched_setscheduler",   .errmsg = true,
804 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
805 	{ .name	    = "seccomp", .errmsg = true,
806 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
807 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
808 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
809 	{ .name	    = "sendmmsg",    .errmsg = true,
810 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
811 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
812 	{ .name	    = "sendmsg",    .errmsg = true,
813 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
814 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
815 	{ .name	    = "sendto",	    .errmsg = true,
816 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
817 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
818 	{ .name	    = "set_tid_address", .errpid = true, },
819 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
820 	{ .name	    = "setpgid",    .errmsg = true, },
821 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
822 	{ .name	    = "setxattr",   .errmsg = true,
823 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
824 	{ .name	    = "shutdown",   .errmsg = true,
825 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
826 	{ .name	    = "socket",	    .errmsg = true,
827 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
828 			     [1] = SCA_SK_TYPE, /* type */ },
829 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
830 	{ .name	    = "socketpair", .errmsg = true,
831 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
832 			     [1] = SCA_SK_TYPE, /* type */ },
833 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
834 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat",
835 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
836 	{ .name	    = "statfs",	    .errmsg = true,
837 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
838 	{ .name	    = "swapoff",    .errmsg = true,
839 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
840 	{ .name	    = "swapon",	    .errmsg = true,
841 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
842 	{ .name	    = "symlinkat",  .errmsg = true,
843 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
844 	{ .name	    = "tgkill",	    .errmsg = true,
845 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
846 	{ .name	    = "tkill",	    .errmsg = true,
847 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
848 	{ .name	    = "truncate",   .errmsg = true,
849 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
850 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
851 	{ .name	    = "unlinkat",   .errmsg = true,
852 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
853 			     [1] = SCA_FILENAME, /* pathname */ }, },
854 	{ .name	    = "utime",  .errmsg = true,
855 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
856 	{ .name	    = "utimensat",  .errmsg = true,
857 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
858 			     [1] = SCA_FILENAME, /* filename */ }, },
859 	{ .name	    = "utimes",  .errmsg = true,
860 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
861 	{ .name	    = "vmsplice",  .errmsg = true,
862 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
863 	{ .name	    = "wait4",	    .errpid = true,
864 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
865 	{ .name	    = "waitid",	    .errpid = true,
866 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
867 	{ .name	    = "write",	    .errmsg = true,
868 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
869 	{ .name	    = "writev",	    .errmsg = true,
870 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
871 };
872 
873 static int syscall_fmt__cmp(const void *name, const void *fmtp)
874 {
875 	const struct syscall_fmt *fmt = fmtp;
876 	return strcmp(name, fmt->name);
877 }
878 
879 static struct syscall_fmt *syscall_fmt__find(const char *name)
880 {
881 	const int nmemb = ARRAY_SIZE(syscall_fmts);
882 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
883 }
884 
/*
 * Per-syscall state, one entry per syscall id in trace->syscalls.table,
 * filled in on first use by trace__read_syscall_info().
 */
struct syscall {
	/* "syscalls:sys_enter_<name>" format; may hold an error pointer, test with IS_ERR() */
	struct event_format *tp_format;
	/* number of entries in 'args' (the leading syscall number field is not counted) */
	int		    nr_args;
	/* tracepoint fields describing the arguments, chained through ->next */
	struct format_field *args;
	/* syscall name as returned by syscalltbl__name() */
	const char	    *name;
	/* true for "exit" and "exit_group" */
	bool		    is_exit;
	/* matching syscall_fmts[] entry, NULL when syscall_fmt__find() finds none */
	struct syscall_fmt  *fmt;
	/* per-argument formatters, nr_args slots; a NULL slot falls back to plain "%ld" */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per-argument formatter parameters, taken from fmt->arg_parm when fmt is set */
	void		    **arg_parm;
};
895 
896 static size_t fprintf_duration(unsigned long t, FILE *fp)
897 {
898 	double duration = (double)t / NSEC_PER_MSEC;
899 	size_t printed = fprintf(fp, "(");
900 
901 	if (duration >= 1.0)
902 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
903 	else if (duration >= 0.01)
904 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
905 	else
906 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
907 	return printed + fprintf(fp, "): ");
908 }
909 
/**
 * struct thread_trace - per-thread tracing state, stored as the thread's priv
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * filename.pending_open: Set by trace__vfs_getname(), cleared on the next
 *                        sys_enter and consumed by trace__sys_exit() to
 *                        associate the name with the fd returned by open.
 * filename.namelen: Longest string that fits in filename.name, not counting
 *                   the terminating NUL.
 * filename.name: Private copy of the last pathname seen by vfs_getname.
 */
struct thread_trace {
	u64		  entry_time;	/* sample time of the last sys_enter */
	u64		  exit_time;	/* sample time of the last sys_exit */
	bool		  entry_pending; /* entry_str formatted but not printed yet */
	unsigned long	  nr_events;	/* bumped by every thread__trace() call */
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counters */
	char		  *entry_str;	/* trace__entry_str_size bytes, allocated on first sys_enter */
	double		  runtime_ms;	/* accumulated by trace__sched_stat_runtime() */
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest valid index in table, -1 while empty */
		char	  **table;	/* fd -> strdup()'ed pathname, NULL when unknown */
	} paths;

	/* keyed by syscall id, each node's priv points to a struct stats */
	struct intlist *syscall_stats;
};
937 
938 static struct thread_trace *thread_trace__new(void)
939 {
940 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
941 
942 	if (ttrace)
943 		ttrace->paths.max = -1;
944 
945 	ttrace->syscall_stats = intlist__new(NULL);
946 
947 	return ttrace;
948 }
949 
950 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
951 {
952 	struct thread_trace *ttrace;
953 
954 	if (thread == NULL)
955 		goto fail;
956 
957 	if (thread__priv(thread) == NULL)
958 		thread__set_priv(thread, thread_trace__new());
959 
960 	if (thread__priv(thread) == NULL)
961 		goto fail;
962 
963 	ttrace = thread__priv(thread);
964 	++ttrace->nr_events;
965 
966 	return ttrace;
967 fail:
968 	color_fprintf(fp, PERF_COLOR_RED,
969 		      "WARNING: not enough memory, dropping samples!\n");
970 	return NULL;
971 }
972 
/*
 * One bit per page fault kind.
 * NOTE(review): the users of these flags are outside this part of the file;
 * presumably they select major/minor fault tracing - confirm.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size in bytes of the per-thread entry_str buffer holding a formatted syscall entry */
static const size_t trace__entry_str_size = 2048;
977 
978 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
979 {
980 	struct thread_trace *ttrace = thread__priv(thread);
981 
982 	if (fd > ttrace->paths.max) {
983 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
984 
985 		if (npath == NULL)
986 			return -1;
987 
988 		if (ttrace->paths.max != -1) {
989 			memset(npath + ttrace->paths.max + 1, 0,
990 			       (fd - ttrace->paths.max) * sizeof(char *));
991 		} else {
992 			memset(npath, 0, (fd + 1) * sizeof(char *));
993 		}
994 
995 		ttrace->paths.table = npath;
996 		ttrace->paths.max   = fd;
997 	}
998 
999 	ttrace->paths.table[fd] = strdup(pathname);
1000 
1001 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1002 }
1003 
1004 static int thread__read_fd_path(struct thread *thread, int fd)
1005 {
1006 	char linkname[PATH_MAX], pathname[PATH_MAX];
1007 	struct stat st;
1008 	int ret;
1009 
1010 	if (thread->pid_ == thread->tid) {
1011 		scnprintf(linkname, sizeof(linkname),
1012 			  "/proc/%d/fd/%d", thread->pid_, fd);
1013 	} else {
1014 		scnprintf(linkname, sizeof(linkname),
1015 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1016 	}
1017 
1018 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1019 		return -1;
1020 
1021 	ret = readlink(linkname, pathname, sizeof(pathname));
1022 
1023 	if (ret < 0 || ret > st.st_size)
1024 		return -1;
1025 
1026 	pathname[ret] = '\0';
1027 	return trace__set_fd_pathname(thread, fd, pathname);
1028 }
1029 
1030 static const char *thread__fd_path(struct thread *thread, int fd,
1031 				   struct trace *trace)
1032 {
1033 	struct thread_trace *ttrace = thread__priv(thread);
1034 
1035 	if (ttrace == NULL)
1036 		return NULL;
1037 
1038 	if (fd < 0)
1039 		return NULL;
1040 
1041 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1042 		if (!trace->live)
1043 			return NULL;
1044 		++trace->stats.proc_getname;
1045 		if (thread__read_fd_path(thread, fd))
1046 			return NULL;
1047 	}
1048 
1049 	return ttrace->paths.table[fd];
1050 }
1051 
1052 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1053 					struct syscall_arg *arg)
1054 {
1055 	int fd = arg->val;
1056 	size_t printed = scnprintf(bf, size, "%d", fd);
1057 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1058 
1059 	if (path)
1060 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1061 
1062 	return printed;
1063 }
1064 
1065 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1066 					      struct syscall_arg *arg)
1067 {
1068 	int fd = arg->val;
1069 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1070 	struct thread_trace *ttrace = thread__priv(arg->thread);
1071 
1072 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1073 		zfree(&ttrace->paths.table[fd]);
1074 
1075 	return printed;
1076 }
1077 
1078 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1079 				     unsigned long ptr)
1080 {
1081 	struct thread_trace *ttrace = thread__priv(thread);
1082 
1083 	ttrace->filename.ptr = ptr;
1084 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1085 }
1086 
1087 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1088 					      struct syscall_arg *arg)
1089 {
1090 	unsigned long ptr = arg->val;
1091 
1092 	if (!arg->trace->vfs_getname)
1093 		return scnprintf(bf, size, "%#x", ptr);
1094 
1095 	thread__set_filename_pos(arg->thread, bf, ptr);
1096 	return 0;
1097 }
1098 
1099 static bool trace__filter_duration(struct trace *trace, double t)
1100 {
1101 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1102 }
1103 
1104 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1105 {
1106 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1107 
1108 	return fprintf(fp, "%10.3f ", ts);
1109 }
1110 
/* Set by sig_handler(): a signal asked us to stop / that signal was SIGINT */
static bool done;
static bool interrupted;

/*
 * Signal handler: flag that we are done and remember whether the signal
 * that got us here was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = (sig == SIGINT);
}
1119 
1120 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1121 					u64 duration, u64 tstamp, FILE *fp)
1122 {
1123 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1124 	printed += fprintf_duration(duration, fp);
1125 
1126 	if (trace->multiple_threads) {
1127 		if (trace->show_comm)
1128 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1129 		printed += fprintf(fp, "%d ", thread->tid);
1130 	}
1131 
1132 	return printed;
1133 }
1134 
1135 static int trace__process_event(struct trace *trace, struct machine *machine,
1136 				union perf_event *event, struct perf_sample *sample)
1137 {
1138 	int ret = 0;
1139 
1140 	switch (event->header.type) {
1141 	case PERF_RECORD_LOST:
1142 		color_fprintf(trace->output, PERF_COLOR_RED,
1143 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1144 		ret = machine__process_lost_event(machine, event, sample);
1145 		break;
1146 	default:
1147 		ret = machine__process_event(machine, event, sample);
1148 		break;
1149 	}
1150 
1151 	return ret;
1152 }
1153 
1154 static int trace__tool_process(struct perf_tool *tool,
1155 			       union perf_event *event,
1156 			       struct perf_sample *sample,
1157 			       struct machine *machine)
1158 {
1159 	struct trace *trace = container_of(tool, struct trace, tool);
1160 	return trace__process_event(trace, machine, event, sample);
1161 }
1162 
1163 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1164 {
1165 	int err = symbol__init(NULL);
1166 
1167 	if (err)
1168 		return err;
1169 
1170 	trace->host = machine__new_host();
1171 	if (trace->host == NULL)
1172 		return -ENOMEM;
1173 
1174 	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1175 		return -errno;
1176 
1177 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1178 					    evlist->threads, trace__tool_process, false,
1179 					    trace->opts.proc_map_timeout);
1180 	if (err)
1181 		symbol__exit();
1182 
1183 	return err;
1184 }
1185 
1186 static int syscall__set_arg_fmts(struct syscall *sc)
1187 {
1188 	struct format_field *field;
1189 	int idx = 0;
1190 
1191 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1192 	if (sc->arg_scnprintf == NULL)
1193 		return -1;
1194 
1195 	if (sc->fmt)
1196 		sc->arg_parm = sc->fmt->arg_parm;
1197 
1198 	for (field = sc->args; field; field = field->next) {
1199 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1200 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1201 		else if (field->flags & FIELD_IS_POINTER)
1202 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1203 		else if (strcmp(field->type, "pid_t") == 0)
1204 			sc->arg_scnprintf[idx] = SCA_PID;
1205 		else if (strcmp(field->type, "umode_t") == 0)
1206 			sc->arg_scnprintf[idx] = SCA_MODE_T;
1207 		++idx;
1208 	}
1209 
1210 	return 0;
1211 }
1212 
1213 static int trace__read_syscall_info(struct trace *trace, int id)
1214 {
1215 	char tp_name[128];
1216 	struct syscall *sc;
1217 	const char *name = syscalltbl__name(trace->sctbl, id);
1218 
1219 	if (name == NULL)
1220 		return -1;
1221 
1222 	if (id > trace->syscalls.max) {
1223 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1224 
1225 		if (nsyscalls == NULL)
1226 			return -1;
1227 
1228 		if (trace->syscalls.max != -1) {
1229 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1230 			       (id - trace->syscalls.max) * sizeof(*sc));
1231 		} else {
1232 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1233 		}
1234 
1235 		trace->syscalls.table = nsyscalls;
1236 		trace->syscalls.max   = id;
1237 	}
1238 
1239 	sc = trace->syscalls.table + id;
1240 	sc->name = name;
1241 
1242 	sc->fmt  = syscall_fmt__find(sc->name);
1243 
1244 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1245 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1246 
1247 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1248 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1249 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1250 	}
1251 
1252 	if (IS_ERR(sc->tp_format))
1253 		return -1;
1254 
1255 	sc->args = sc->tp_format->format.fields;
1256 	sc->nr_args = sc->tp_format->format.nr_fields;
1257 	/*
1258 	 * We need to check and discard the first variable '__syscall_nr'
1259 	 * or 'nr' that mean the syscall number. It is needless here.
1260 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1261 	 */
1262 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1263 		sc->args = sc->args->next;
1264 		--sc->nr_args;
1265 	}
1266 
1267 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1268 
1269 	return syscall__set_arg_fmts(sc);
1270 }
1271 
1272 static int trace__validate_ev_qualifier(struct trace *trace)
1273 {
1274 	int err = 0, i;
1275 	struct str_node *pos;
1276 
1277 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1278 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1279 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1280 
1281 	if (trace->ev_qualifier_ids.entries == NULL) {
1282 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1283 		       trace->output);
1284 		err = -EINVAL;
1285 		goto out;
1286 	}
1287 
1288 	i = 0;
1289 
1290 	strlist__for_each(pos, trace->ev_qualifier) {
1291 		const char *sc = pos->s;
1292 		int id = syscalltbl__id(trace->sctbl, sc);
1293 
1294 		if (id < 0) {
1295 			if (err == 0) {
1296 				fputs("Error:\tInvalid syscall ", trace->output);
1297 				err = -EINVAL;
1298 			} else {
1299 				fputs(", ", trace->output);
1300 			}
1301 
1302 			fputs(sc, trace->output);
1303 		}
1304 
1305 		trace->ev_qualifier_ids.entries[i++] = id;
1306 	}
1307 
1308 	if (err < 0) {
1309 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1310 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1311 		zfree(&trace->ev_qualifier_ids.entries);
1312 		trace->ev_qualifier_ids.nr = 0;
1313 	}
1314 out:
1315 	return err;
1316 }
1317 
1318 /*
1319  * args is to be interpreted as a series of longs but we need to handle
1320  * 8-byte unaligned accesses. args points to raw_data within the event
1321  * and raw_data is guaranteed to be 8-byte unaligned because it is
1322  * preceded by raw_size which is a u32. So we need to copy args to a temp
1323  * variable to read it. Most notably this avoids extended load instructions
1324  * on unaligned addresses
1325  */
1326 
1327 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1328 				      unsigned char *args, struct trace *trace,
1329 				      struct thread *thread)
1330 {
1331 	size_t printed = 0;
1332 	unsigned char *p;
1333 	unsigned long val;
1334 
1335 	if (sc->args != NULL) {
1336 		struct format_field *field;
1337 		u8 bit = 1;
1338 		struct syscall_arg arg = {
1339 			.idx	= 0,
1340 			.mask	= 0,
1341 			.trace  = trace,
1342 			.thread = thread,
1343 		};
1344 
1345 		for (field = sc->args; field;
1346 		     field = field->next, ++arg.idx, bit <<= 1) {
1347 			if (arg.mask & bit)
1348 				continue;
1349 
1350 			/* special care for unaligned accesses */
1351 			p = args + sizeof(unsigned long) * arg.idx;
1352 			memcpy(&val, p, sizeof(val));
1353 
1354 			/*
1355  			 * Suppress this argument if its value is zero and
1356  			 * and we don't have a string associated in an
1357  			 * strarray for it.
1358  			 */
1359 			if (val == 0 &&
1360 			    !(sc->arg_scnprintf &&
1361 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1362 			      sc->arg_parm[arg.idx]))
1363 				continue;
1364 
1365 			printed += scnprintf(bf + printed, size - printed,
1366 					     "%s%s: ", printed ? ", " : "", field->name);
1367 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1368 				arg.val = val;
1369 				if (sc->arg_parm)
1370 					arg.parm = sc->arg_parm[arg.idx];
1371 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1372 								      size - printed, &arg);
1373 			} else {
1374 				printed += scnprintf(bf + printed, size - printed,
1375 						     "%ld", val);
1376 			}
1377 		}
1378 	} else if (IS_ERR(sc->tp_format)) {
1379 		/*
1380 		 * If we managed to read the tracepoint /format file, then we
1381 		 * may end up not having any args, like with gettid(), so only
1382 		 * print the raw args when we didn't manage to read it.
1383 		 */
1384 		int i = 0;
1385 
1386 		while (i < 6) {
1387 			/* special care for unaligned accesses */
1388 			p = args + sizeof(unsigned long) * i;
1389 			memcpy(&val, p, sizeof(val));
1390 			printed += scnprintf(bf + printed, size - printed,
1391 					     "%sarg%d: %ld",
1392 					     printed ? ", " : "", i, val);
1393 			++i;
1394 		}
1395 	}
1396 
1397 	return printed;
1398 }
1399 
/*
 * Signature shared by the per-event sample handlers in this file, e.g.
 * trace__sys_enter(), trace__sys_exit() and trace__vfs_getname().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1403 
1404 static struct syscall *trace__syscall_info(struct trace *trace,
1405 					   struct perf_evsel *evsel, int id)
1406 {
1407 
1408 	if (id < 0) {
1409 
1410 		/*
1411 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1412 		 * before that, leaving at a higher verbosity level till that is
1413 		 * explained. Reproduced with plain ftrace with:
1414 		 *
1415 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1416 		 * grep "NR -1 " /t/trace_pipe
1417 		 *
1418 		 * After generating some load on the machine.
1419  		 */
1420 		if (verbose > 1) {
1421 			static u64 n;
1422 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1423 				id, perf_evsel__name(evsel), ++n);
1424 		}
1425 		return NULL;
1426 	}
1427 
1428 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1429 	    trace__read_syscall_info(trace, id))
1430 		goto out_cant_read;
1431 
1432 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1433 		goto out_cant_read;
1434 
1435 	return &trace->syscalls.table[id];
1436 
1437 out_cant_read:
1438 	if (verbose) {
1439 		fprintf(trace->output, "Problems reading syscall %d", id);
1440 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1441 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1442 		fputs(" information\n", trace->output);
1443 	}
1444 	return NULL;
1445 }
1446 
1447 static void thread__update_stats(struct thread_trace *ttrace,
1448 				 int id, struct perf_sample *sample)
1449 {
1450 	struct int_node *inode;
1451 	struct stats *stats;
1452 	u64 duration = 0;
1453 
1454 	inode = intlist__findnew(ttrace->syscall_stats, id);
1455 	if (inode == NULL)
1456 		return;
1457 
1458 	stats = inode->priv;
1459 	if (stats == NULL) {
1460 		stats = malloc(sizeof(struct stats));
1461 		if (stats == NULL)
1462 			return;
1463 		init_stats(stats);
1464 		inode->priv = stats;
1465 	}
1466 
1467 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1468 		duration = sample->time - ttrace->entry_time;
1469 
1470 	update_stats(stats, duration);
1471 }
1472 
1473 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1474 {
1475 	struct thread_trace *ttrace;
1476 	u64 duration;
1477 	size_t printed;
1478 
1479 	if (trace->current == NULL)
1480 		return 0;
1481 
1482 	ttrace = thread__priv(trace->current);
1483 
1484 	if (!ttrace->entry_pending)
1485 		return 0;
1486 
1487 	duration = sample->time - ttrace->entry_time;
1488 
1489 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1490 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1491 	ttrace->entry_pending = false;
1492 
1493 	return printed;
1494 }
1495 
1496 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1497 			    union perf_event *event __maybe_unused,
1498 			    struct perf_sample *sample)
1499 {
1500 	char *msg;
1501 	void *args;
1502 	size_t printed = 0;
1503 	struct thread *thread;
1504 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1505 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1506 	struct thread_trace *ttrace;
1507 
1508 	if (sc == NULL)
1509 		return -1;
1510 
1511 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1512 	ttrace = thread__trace(thread, trace->output);
1513 	if (ttrace == NULL)
1514 		goto out_put;
1515 
1516 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1517 
1518 	if (ttrace->entry_str == NULL) {
1519 		ttrace->entry_str = malloc(trace__entry_str_size);
1520 		if (!ttrace->entry_str)
1521 			goto out_put;
1522 	}
1523 
1524 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1525 		trace__printf_interrupted_entry(trace, sample);
1526 
1527 	ttrace->entry_time = sample->time;
1528 	msg = ttrace->entry_str;
1529 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1530 
1531 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1532 					   args, trace, thread);
1533 
1534 	if (sc->is_exit) {
1535 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1536 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1537 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1538 		}
1539 	} else {
1540 		ttrace->entry_pending = true;
1541 		/* See trace__vfs_getname & trace__sys_exit */
1542 		ttrace->filename.pending_open = false;
1543 	}
1544 
1545 	if (trace->current != thread) {
1546 		thread__put(trace->current);
1547 		trace->current = thread__get(thread);
1548 	}
1549 	err = 0;
1550 out_put:
1551 	thread__put(thread);
1552 	return err;
1553 }
1554 
1555 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1556 				    struct perf_sample *sample,
1557 				    struct callchain_cursor *cursor)
1558 {
1559 	struct addr_location al;
1560 
1561 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1562 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1563 		return -1;
1564 
1565 	return 0;
1566 }
1567 
1568 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1569 {
1570 	/* TODO: user-configurable print_opts */
1571 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1572 				        EVSEL__PRINT_DSO |
1573 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1574 
1575 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1576 }
1577 
1578 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1579 			   union perf_event *event __maybe_unused,
1580 			   struct perf_sample *sample)
1581 {
1582 	long ret;
1583 	u64 duration = 0;
1584 	struct thread *thread;
1585 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1586 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1587 	struct thread_trace *ttrace;
1588 
1589 	if (sc == NULL)
1590 		return -1;
1591 
1592 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1593 	ttrace = thread__trace(thread, trace->output);
1594 	if (ttrace == NULL)
1595 		goto out_put;
1596 
1597 	if (trace->summary)
1598 		thread__update_stats(ttrace, id, sample);
1599 
1600 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1601 
1602 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1603 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1604 		ttrace->filename.pending_open = false;
1605 		++trace->stats.vfs_getname;
1606 	}
1607 
1608 	ttrace->exit_time = sample->time;
1609 
1610 	if (ttrace->entry_time) {
1611 		duration = sample->time - ttrace->entry_time;
1612 		if (trace__filter_duration(trace, duration))
1613 			goto out;
1614 	} else if (trace->duration_filter)
1615 		goto out;
1616 
1617 	if (sample->callchain) {
1618 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1619 		if (callchain_ret == 0) {
1620 			if (callchain_cursor.nr < trace->min_stack)
1621 				goto out;
1622 			callchain_ret = 1;
1623 		}
1624 	}
1625 
1626 	if (trace->summary_only)
1627 		goto out;
1628 
1629 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1630 
1631 	if (ttrace->entry_pending) {
1632 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1633 	} else {
1634 		fprintf(trace->output, " ... [");
1635 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1636 		fprintf(trace->output, "]: %s()", sc->name);
1637 	}
1638 
1639 	if (sc->fmt == NULL) {
1640 signed_print:
1641 		fprintf(trace->output, ") = %ld", ret);
1642 	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
1643 		char bf[STRERR_BUFSIZE];
1644 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1645 			   *e = audit_errno_to_name(-ret);
1646 
1647 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1648 	} else if (ret == 0 && sc->fmt->timeout)
1649 		fprintf(trace->output, ") = 0 Timeout");
1650 	else if (sc->fmt->hexret)
1651 		fprintf(trace->output, ") = %#lx", ret);
1652 	else if (sc->fmt->errpid) {
1653 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1654 
1655 		if (child != NULL) {
1656 			fprintf(trace->output, ") = %ld", ret);
1657 			if (child->comm_set)
1658 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1659 			thread__put(child);
1660 		}
1661 	} else
1662 		goto signed_print;
1663 
1664 	fputc('\n', trace->output);
1665 
1666 	if (callchain_ret > 0)
1667 		trace__fprintf_callchain(trace, sample);
1668 	else if (callchain_ret < 0)
1669 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1670 out:
1671 	ttrace->entry_pending = false;
1672 	err = 0;
1673 out_put:
1674 	thread__put(thread);
1675 	return err;
1676 }
1677 
1678 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1679 			      union perf_event *event __maybe_unused,
1680 			      struct perf_sample *sample)
1681 {
1682 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1683 	struct thread_trace *ttrace;
1684 	size_t filename_len, entry_str_len, to_move;
1685 	ssize_t remaining_space;
1686 	char *pos;
1687 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1688 
1689 	if (!thread)
1690 		goto out;
1691 
1692 	ttrace = thread__priv(thread);
1693 	if (!ttrace)
1694 		goto out;
1695 
1696 	filename_len = strlen(filename);
1697 
1698 	if (ttrace->filename.namelen < filename_len) {
1699 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1700 
1701 		if (f == NULL)
1702 				goto out;
1703 
1704 		ttrace->filename.namelen = filename_len;
1705 		ttrace->filename.name = f;
1706 	}
1707 
1708 	strcpy(ttrace->filename.name, filename);
1709 	ttrace->filename.pending_open = true;
1710 
1711 	if (!ttrace->filename.ptr)
1712 		goto out;
1713 
1714 	entry_str_len = strlen(ttrace->entry_str);
1715 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1716 	if (remaining_space <= 0)
1717 		goto out;
1718 
1719 	if (filename_len > (size_t)remaining_space) {
1720 		filename += filename_len - remaining_space;
1721 		filename_len = remaining_space;
1722 	}
1723 
1724 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1725 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1726 	memmove(pos + filename_len, pos, to_move);
1727 	memcpy(pos, filename, filename_len);
1728 
1729 	ttrace->filename.ptr = 0;
1730 	ttrace->filename.entry_str_pos = 0;
1731 out:
1732 	return 0;
1733 }
1734 
1735 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1736 				     union perf_event *event __maybe_unused,
1737 				     struct perf_sample *sample)
1738 {
1739         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1740 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1741 	struct thread *thread = machine__findnew_thread(trace->host,
1742 							sample->pid,
1743 							sample->tid);
1744 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1745 
1746 	if (ttrace == NULL)
1747 		goto out_dump;
1748 
1749 	ttrace->runtime_ms += runtime_ms;
1750 	trace->runtime_ms += runtime_ms;
1751 	thread__put(thread);
1752 	return 0;
1753 
1754 out_dump:
1755 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1756 	       evsel->name,
1757 	       perf_evsel__strval(evsel, sample, "comm"),
1758 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1759 	       runtime,
1760 	       perf_evsel__intval(evsel, sample, "vruntime"));
1761 	thread__put(thread);
1762 	return 0;
1763 }
1764 
1765 static void bpf_output__printer(enum binary_printer_ops op,
1766 				unsigned int val, void *extra)
1767 {
1768 	FILE *output = extra;
1769 	unsigned char ch = (unsigned char)val;
1770 
1771 	switch (op) {
1772 	case BINARY_PRINT_CHAR_DATA:
1773 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1774 		break;
1775 	case BINARY_PRINT_DATA_BEGIN:
1776 	case BINARY_PRINT_LINE_BEGIN:
1777 	case BINARY_PRINT_ADDR:
1778 	case BINARY_PRINT_NUM_DATA:
1779 	case BINARY_PRINT_NUM_PAD:
1780 	case BINARY_PRINT_SEP:
1781 	case BINARY_PRINT_CHAR_PAD:
1782 	case BINARY_PRINT_LINE_END:
1783 	case BINARY_PRINT_DATA_END:
1784 	default:
1785 		break;
1786 	}
1787 }
1788 
1789 static void bpf_output__fprintf(struct trace *trace,
1790 				struct perf_sample *sample)
1791 {
1792 	print_binary(sample->raw_data, sample->raw_size, 8,
1793 		     bpf_output__printer, trace->output);
1794 }
1795 
1796 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1797 				union perf_event *event __maybe_unused,
1798 				struct perf_sample *sample)
1799 {
1800 	int callchain_ret = 0;
1801 
1802 	if (sample->callchain) {
1803 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1804 		if (callchain_ret == 0) {
1805 			if (callchain_cursor.nr < trace->min_stack)
1806 				goto out;
1807 			callchain_ret = 1;
1808 		}
1809 	}
1810 
1811 	trace__printf_interrupted_entry(trace, sample);
1812 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1813 
1814 	if (trace->trace_syscalls)
1815 		fprintf(trace->output, "(         ): ");
1816 
1817 	fprintf(trace->output, "%s:", evsel->name);
1818 
1819 	if (perf_evsel__is_bpf_output(evsel)) {
1820 		bpf_output__fprintf(trace, sample);
1821 	} else if (evsel->tp_format) {
1822 		event_format__fprintf(evsel->tp_format, sample->cpu,
1823 				      sample->raw_data, sample->raw_size,
1824 				      trace->output);
1825 	}
1826 
1827 	fprintf(trace->output, ")\n");
1828 
1829 	if (callchain_ret > 0)
1830 		trace__fprintf_callchain(trace, sample);
1831 	else if (callchain_ret < 0)
1832 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1833 out:
1834 	return 0;
1835 }
1836 
1837 static void print_location(FILE *f, struct perf_sample *sample,
1838 			   struct addr_location *al,
1839 			   bool print_dso, bool print_sym)
1840 {
1841 
1842 	if ((verbose || print_dso) && al->map)
1843 		fprintf(f, "%s@", al->map->dso->long_name);
1844 
1845 	if ((verbose || print_sym) && al->sym)
1846 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1847 			al->addr - al->sym->start);
1848 	else if (al->map)
1849 		fprintf(f, "0x%" PRIx64, al->addr);
1850 	else
1851 		fprintf(f, "0x%" PRIx64, sample->addr);
1852 }
1853 
1854 static int trace__pgfault(struct trace *trace,
1855 			  struct perf_evsel *evsel,
1856 			  union perf_event *event __maybe_unused,
1857 			  struct perf_sample *sample)
1858 {
1859 	struct thread *thread;
1860 	struct addr_location al;
1861 	char map_type = 'd';
1862 	struct thread_trace *ttrace;
1863 	int err = -1;
1864 	int callchain_ret = 0;
1865 
1866 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1867 
1868 	if (sample->callchain) {
1869 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1870 		if (callchain_ret == 0) {
1871 			if (callchain_cursor.nr < trace->min_stack)
1872 				goto out_put;
1873 			callchain_ret = 1;
1874 		}
1875 	}
1876 
1877 	ttrace = thread__trace(thread, trace->output);
1878 	if (ttrace == NULL)
1879 		goto out_put;
1880 
1881 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1882 		ttrace->pfmaj++;
1883 	else
1884 		ttrace->pfmin++;
1885 
1886 	if (trace->summary_only)
1887 		goto out;
1888 
1889 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
1890 			      sample->ip, &al);
1891 
1892 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1893 
1894 	fprintf(trace->output, "%sfault [",
1895 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1896 		"maj" : "min");
1897 
1898 	print_location(trace->output, sample, &al, false, true);
1899 
1900 	fprintf(trace->output, "] => ");
1901 
1902 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
1903 				   sample->addr, &al);
1904 
1905 	if (!al.map) {
1906 		thread__find_addr_location(thread, sample->cpumode,
1907 					   MAP__FUNCTION, sample->addr, &al);
1908 
1909 		if (al.map)
1910 			map_type = 'x';
1911 		else
1912 			map_type = '?';
1913 	}
1914 
1915 	print_location(trace->output, sample, &al, true, false);
1916 
1917 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1918 
1919 	if (callchain_ret > 0)
1920 		trace__fprintf_callchain(trace, sample);
1921 	else if (callchain_ret < 0)
1922 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1923 out:
1924 	err = 0;
1925 out_put:
1926 	thread__put(thread);
1927 	return err;
1928 }
1929 
1930 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1931 {
1932 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1933 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1934 		return false;
1935 
1936 	if (trace->pid_list || trace->tid_list)
1937 		return true;
1938 
1939 	return false;
1940 }
1941 
1942 static void trace__set_base_time(struct trace *trace,
1943 				 struct perf_evsel *evsel,
1944 				 struct perf_sample *sample)
1945 {
1946 	/*
1947 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1948 	 * and don't use sample->time unconditionally, we may end up having
1949 	 * some other event in the future without PERF_SAMPLE_TIME for good
1950 	 * reason, i.e. we may not be interested in its timestamps, just in
1951 	 * it taking place, picking some piece of information when it
1952 	 * appears in our event stream (vfs_getname comes to mind).
1953 	 */
1954 	if (trace->base_time == 0 && !trace->full_time &&
1955 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1956 		trace->base_time = sample->time;
1957 }
1958 
1959 static int trace__process_sample(struct perf_tool *tool,
1960 				 union perf_event *event,
1961 				 struct perf_sample *sample,
1962 				 struct perf_evsel *evsel,
1963 				 struct machine *machine __maybe_unused)
1964 {
1965 	struct trace *trace = container_of(tool, struct trace, tool);
1966 	int err = 0;
1967 
1968 	tracepoint_handler handler = evsel->handler;
1969 
1970 	if (skip_sample(trace, sample))
1971 		return 0;
1972 
1973 	trace__set_base_time(trace, evsel, sample);
1974 
1975 	if (handler) {
1976 		++trace->nr_events;
1977 		handler(trace, evsel, event, sample);
1978 	}
1979 
1980 	return err;
1981 }
1982 
1983 static int parse_target_str(struct trace *trace)
1984 {
1985 	if (trace->opts.target.pid) {
1986 		trace->pid_list = intlist__new(trace->opts.target.pid);
1987 		if (trace->pid_list == NULL) {
1988 			pr_err("Error parsing process id string\n");
1989 			return -EINVAL;
1990 		}
1991 	}
1992 
1993 	if (trace->opts.target.tid) {
1994 		trace->tid_list = intlist__new(trace->opts.target.tid);
1995 		if (trace->tid_list == NULL) {
1996 			pr_err("Error parsing thread id string\n");
1997 			return -EINVAL;
1998 		}
1999 	}
2000 
2001 	return 0;
2002 }
2003 
2004 static int trace__record(struct trace *trace, int argc, const char **argv)
2005 {
2006 	unsigned int rec_argc, i, j;
2007 	const char **rec_argv;
2008 	const char * const record_args[] = {
2009 		"record",
2010 		"-R",
2011 		"-m", "1024",
2012 		"-c", "1",
2013 	};
2014 
2015 	const char * const sc_args[] = { "-e", };
2016 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2017 	const char * const majpf_args[] = { "-e", "major-faults" };
2018 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2019 	const char * const minpf_args[] = { "-e", "minor-faults" };
2020 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2021 
2022 	/* +1 is for the event string below */
2023 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2024 		majpf_args_nr + minpf_args_nr + argc;
2025 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2026 
2027 	if (rec_argv == NULL)
2028 		return -ENOMEM;
2029 
2030 	j = 0;
2031 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2032 		rec_argv[j++] = record_args[i];
2033 
2034 	if (trace->trace_syscalls) {
2035 		for (i = 0; i < sc_args_nr; i++)
2036 			rec_argv[j++] = sc_args[i];
2037 
2038 		/* event string may be different for older kernels - e.g., RHEL6 */
2039 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2040 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2041 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2042 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2043 		else {
2044 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2045 			return -1;
2046 		}
2047 	}
2048 
2049 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2050 		for (i = 0; i < majpf_args_nr; i++)
2051 			rec_argv[j++] = majpf_args[i];
2052 
2053 	if (trace->trace_pgfaults & TRACE_PFMIN)
2054 		for (i = 0; i < minpf_args_nr; i++)
2055 			rec_argv[j++] = minpf_args[i];
2056 
2057 	for (i = 0; i < (unsigned int)argc; i++)
2058 		rec_argv[j++] = argv[i];
2059 
2060 	return cmd_record(j, rec_argv, NULL);
2061 }
2062 
2063 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2064 
2065 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2066 {
2067 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2068 
2069 	if (IS_ERR(evsel))
2070 		return false;
2071 
2072 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2073 		perf_evsel__delete(evsel);
2074 		return false;
2075 	}
2076 
2077 	evsel->handler = trace__vfs_getname;
2078 	perf_evlist__add(evlist, evsel);
2079 	return true;
2080 }
2081 
2082 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2083 {
2084 	struct perf_evsel *evsel;
2085 	struct perf_event_attr attr = {
2086 		.type = PERF_TYPE_SOFTWARE,
2087 		.mmap_data = 1,
2088 	};
2089 
2090 	attr.config = config;
2091 	attr.sample_period = 1;
2092 
2093 	event_attr_init(&attr);
2094 
2095 	evsel = perf_evsel__new(&attr);
2096 	if (evsel)
2097 		evsel->handler = trace__pgfault;
2098 
2099 	return evsel;
2100 }
2101 
2102 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2103 {
2104 	const u32 type = event->header.type;
2105 	struct perf_evsel *evsel;
2106 
2107 	if (type != PERF_RECORD_SAMPLE) {
2108 		trace__process_event(trace, trace->host, event, sample);
2109 		return;
2110 	}
2111 
2112 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2113 	if (evsel == NULL) {
2114 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2115 		return;
2116 	}
2117 
2118 	trace__set_base_time(trace, evsel, sample);
2119 
2120 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2121 	    sample->raw_data == NULL) {
2122 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2123 		       perf_evsel__name(evsel), sample->tid,
2124 		       sample->cpu, sample->raw_size);
2125 	} else {
2126 		tracepoint_handler handler = evsel->handler;
2127 		handler(trace, evsel, event, sample);
2128 	}
2129 }
2130 
2131 static int trace__add_syscall_newtp(struct trace *trace)
2132 {
2133 	int ret = -1;
2134 	struct perf_evlist *evlist = trace->evlist;
2135 	struct perf_evsel *sys_enter, *sys_exit;
2136 
2137 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2138 	if (sys_enter == NULL)
2139 		goto out;
2140 
2141 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2142 		goto out_delete_sys_enter;
2143 
2144 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2145 	if (sys_exit == NULL)
2146 		goto out_delete_sys_enter;
2147 
2148 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2149 		goto out_delete_sys_exit;
2150 
2151 	perf_evlist__add(evlist, sys_enter);
2152 	perf_evlist__add(evlist, sys_exit);
2153 
2154 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2155 		/*
2156 		 * We're interested only in the user space callchain
2157 		 * leading to the syscall, allow overriding that for
2158 		 * debugging reasons using --kernel_syscall_callchains
2159 		 */
2160 		sys_exit->attr.exclude_callchain_kernel = 1;
2161 	}
2162 
2163 	trace->syscalls.events.sys_enter = sys_enter;
2164 	trace->syscalls.events.sys_exit  = sys_exit;
2165 
2166 	ret = 0;
2167 out:
2168 	return ret;
2169 
2170 out_delete_sys_exit:
2171 	perf_evsel__delete_priv(sys_exit);
2172 out_delete_sys_enter:
2173 	perf_evsel__delete_priv(sys_enter);
2174 	goto out;
2175 }
2176 
2177 static int trace__set_ev_qualifier_filter(struct trace *trace)
2178 {
2179 	int err = -1;
2180 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2181 						trace->ev_qualifier_ids.nr,
2182 						trace->ev_qualifier_ids.entries);
2183 
2184 	if (filter == NULL)
2185 		goto out_enomem;
2186 
2187 	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2188 		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2189 
2190 	free(filter);
2191 out:
2192 	return err;
2193 out_enomem:
2194 	errno = ENOMEM;
2195 	goto out;
2196 }
2197 
/*
 * Live tracing: set up the requested events, optionally start a workload,
 * then consume the ring buffers until interrupted or drained.
 *
 * @trace: tool state; trace->evlist is consumed (deleted and set to NULL)
 *         before returning
 * @argc:  number of workload arguments; > 0 means a workload is forked
 * @argv:  workload command line
 *
 * Steps, in order: add syscall, vfs_getname, page fault and sched events as
 * requested; create the target maps; initialize symbols; configure
 * callchains; prepare the workload; open the events; apply BPF config and
 * filters; mmap; enable and start; read events in a loop; print the summary
 * and tool stats when no error happened.
 *
 * Returns the last value stored in 'err': negative on setup failure.  Setup
 * failures print a message on trace->output (or via pr_err()) through the
 * out_error_* labels near the end of the function.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	/* vfs_getname is optional: just remember whether it is available. */
	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	/*
	 * Callchains are only configured on sys_exit and the page fault
	 * events, not on every evsel in the list.
	 */
	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
			/*
			 * Now we have evsels with different sample_ids, use
			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			 * from a fixed position in each ring buffer record.
			 *
			 * As of the changeset introducing this comment, this
			 * isn't strictly needed, as the fields that can come before
			 * PERF_SAMPLE_ID are all used, but we'll probably disable
			 * some of those for things like copying the payload of
			 * pointer syscall arguments, and for vfs_getname we don't
			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			 */
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure 'evsel' is used by the error path to name the culprit. */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	/* Snapshot the counter to detect whether this pass consumed anything. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/*
			 * Once 'done' is set, stop producing new events but
			 * keep reading what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/*
		 * Nothing was read in this pass: block in poll, or use a
		 * timeout of 100 once 'done' is set.
		 */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			/* No pollable fd left after dropping the failed ones. */
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error reporting.  The block below is only ever entered through its labels;
 * it exists to give 'errbuf' a scope shared by those labels.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		strerror_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2456 
2457 static int trace__replay(struct trace *trace)
2458 {
2459 	const struct perf_evsel_str_handler handlers[] = {
2460 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2461 	};
2462 	struct perf_data_file file = {
2463 		.path  = input_name,
2464 		.mode  = PERF_DATA_MODE_READ,
2465 		.force = trace->force,
2466 	};
2467 	struct perf_session *session;
2468 	struct perf_evsel *evsel;
2469 	int err = -1;
2470 
2471 	trace->tool.sample	  = trace__process_sample;
2472 	trace->tool.mmap	  = perf_event__process_mmap;
2473 	trace->tool.mmap2	  = perf_event__process_mmap2;
2474 	trace->tool.comm	  = perf_event__process_comm;
2475 	trace->tool.exit	  = perf_event__process_exit;
2476 	trace->tool.fork	  = perf_event__process_fork;
2477 	trace->tool.attr	  = perf_event__process_attr;
2478 	trace->tool.tracing_data = perf_event__process_tracing_data;
2479 	trace->tool.build_id	  = perf_event__process_build_id;
2480 
2481 	trace->tool.ordered_events = true;
2482 	trace->tool.ordering_requires_timestamps = true;
2483 
2484 	/* add tid to output */
2485 	trace->multiple_threads = true;
2486 
2487 	session = perf_session__new(&file, false, &trace->tool);
2488 	if (session == NULL)
2489 		return -1;
2490 
2491 	if (symbol__init(&session->header.env) < 0)
2492 		goto out;
2493 
2494 	trace->host = &session->machines.host;
2495 
2496 	err = perf_session__set_tracepoints_handlers(session, handlers);
2497 	if (err)
2498 		goto out;
2499 
2500 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2501 						     "raw_syscalls:sys_enter");
2502 	/* older kernels have syscalls tp versus raw_syscalls */
2503 	if (evsel == NULL)
2504 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2505 							     "syscalls:sys_enter");
2506 
2507 	if (evsel &&
2508 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2509 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2510 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2511 		goto out;
2512 	}
2513 
2514 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2515 						     "raw_syscalls:sys_exit");
2516 	if (evsel == NULL)
2517 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2518 							     "syscalls:sys_exit");
2519 	if (evsel &&
2520 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2521 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2522 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2523 		goto out;
2524 	}
2525 
2526 	evlist__for_each(session->evlist, evsel) {
2527 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2528 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2529 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2530 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2531 			evsel->handler = trace__pgfault;
2532 	}
2533 
2534 	err = parse_target_str(trace);
2535 	if (err != 0)
2536 		goto out;
2537 
2538 	setup_pager();
2539 
2540 	err = perf_session__process_events(session);
2541 	if (err)
2542 		pr_err("Failed to process events, error %d", err);
2543 
2544 	else if (trace->summary)
2545 		trace__fprintf_thread_summary(trace, trace->output);
2546 
2547 out:
2548 	perf_session__delete(session);
2549 
2550 	return err;
2551 }
2552 
/*
 * Print the heading of the per-thread summary.
 *
 * Returns the value returned by fprintf(), converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2561 
/*
 * Re-sorted view of a thread's per-syscall statistics, ordered by the
 * comparison a->msecs > b->msecs (presumably largest total time first --
 * confirm against DEFINE_RESORT_RB).
 *
 * Each entry carries the stats pointer, the total time in msecs and the
 * syscall id.  The body below fills one entry from an int_node of the source
 * tree: ->i is the syscall id and ->priv the struct stats.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* Total time: number of calls times the average, converted to msecs. */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2575 
2576 static size_t thread__dump_stats(struct thread_trace *ttrace,
2577 				 struct trace *trace, FILE *fp)
2578 {
2579 	size_t printed = 0;
2580 	struct syscall *sc;
2581 	struct rb_node *nd;
2582 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2583 
2584 	if (syscall_stats == NULL)
2585 		return 0;
2586 
2587 	printed += fprintf(fp, "\n");
2588 
2589 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2590 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2591 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2592 
2593 	resort_rb__for_each(nd, syscall_stats) {
2594 		struct stats *stats = syscall_stats_entry->stats;
2595 		if (stats) {
2596 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2597 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2598 			double avg = avg_stats(stats);
2599 			double pct;
2600 			u64 n = (u64) stats->n;
2601 
2602 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2603 			avg /= NSEC_PER_MSEC;
2604 
2605 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2606 			printed += fprintf(fp, "   %-15s", sc->name);
2607 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2608 					   n, syscall_stats_entry->msecs, min, avg);
2609 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2610 		}
2611 	}
2612 
2613 	resort_rb__delete(syscall_stats);
2614 	printed += fprintf(fp, "\n\n");
2615 
2616 	return printed;
2617 }
2618 
2619 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2620 {
2621 	size_t printed = 0;
2622 	struct thread_trace *ttrace = thread__priv(thread);
2623 	double ratio;
2624 
2625 	if (ttrace == NULL)
2626 		return 0;
2627 
2628 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2629 
2630 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2631 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2632 	printed += fprintf(fp, "%.1f%%", ratio);
2633 	if (ttrace->pfmaj)
2634 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2635 	if (ttrace->pfmin)
2636 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2637 	if (trace->sched)
2638 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2639 	else if (fputc('\n', fp) != EOF)
2640 		++printed;
2641 
2642 	printed += thread__dump_stats(ttrace, trace, fp);
2643 
2644 	return printed;
2645 }
2646 
2647 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2648 {
2649 	return ttrace ? ttrace->nr_events : 0;
2650 }
2651 
/*
 * Re-sorted view of the threads tree, ordered by comparing the per-thread
 * event counts with '<' (presumably fewest events first -- confirm against
 * DEFINE_RESORT_RB).  Threads without trace private data count as 0 events.
 *
 * The body below fills one entry with the struct thread embedding the source
 * rb_node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2658 
2659 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2660 {
2661 	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
2662 	size_t printed = trace__fprintf_threads_header(fp);
2663 	struct rb_node *nd;
2664 
2665 	if (threads == NULL) {
2666 		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2667 		return 0;
2668 	}
2669 
2670 	resort_rb__for_each(nd, threads)
2671 		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2672 
2673 	resort_rb__delete(threads);
2674 
2675 	return printed;
2676 }
2677 
2678 static int trace__set_duration(const struct option *opt, const char *str,
2679 			       int unset __maybe_unused)
2680 {
2681 	struct trace *trace = opt->value;
2682 
2683 	trace->duration_filter = atof(str);
2684 	return 0;
2685 }
2686 
2687 static int trace__set_filter_pids(const struct option *opt, const char *str,
2688 				  int unset __maybe_unused)
2689 {
2690 	int ret = -1;
2691 	size_t i;
2692 	struct trace *trace = opt->value;
2693 	/*
2694 	 * FIXME: introduce a intarray class, plain parse csv and create a
2695 	 * { int nr, int entries[] } struct...
2696 	 */
2697 	struct intlist *list = intlist__new(str);
2698 
2699 	if (list == NULL)
2700 		return -1;
2701 
2702 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2703 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2704 
2705 	if (trace->filter_pids.entries == NULL)
2706 		goto out;
2707 
2708 	trace->filter_pids.entries[0] = getpid();
2709 
2710 	for (i = 1; i < trace->filter_pids.nr; ++i)
2711 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2712 
2713 	intlist__delete(list);
2714 	ret = 0;
2715 out:
2716 	return ret;
2717 }
2718 
2719 static int trace__open_output(struct trace *trace, const char *filename)
2720 {
2721 	struct stat st;
2722 
2723 	if (!stat(filename, &st) && st.st_size) {
2724 		char oldname[PATH_MAX];
2725 
2726 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2727 		unlink(oldname);
2728 		rename(filename, oldname);
2729 	}
2730 
2731 	trace->output = fopen(filename, "w");
2732 
2733 	return trace->output == NULL ? -errno : 0;
2734 }
2735 
2736 static int parse_pagefaults(const struct option *opt, const char *str,
2737 			    int unset __maybe_unused)
2738 {
2739 	int *trace_pgfaults = opt->value;
2740 
2741 	if (strcmp(str, "all") == 0)
2742 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2743 	else if (strcmp(str, "maj") == 0)
2744 		*trace_pgfaults |= TRACE_PFMAJ;
2745 	else if (strcmp(str, "min") == 0)
2746 		*trace_pgfaults |= TRACE_PFMIN;
2747 	else
2748 		return -1;
2749 
2750 	return 0;
2751 }
2752 
2753 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2754 {
2755 	struct perf_evsel *evsel;
2756 
2757 	evlist__for_each(evlist, evsel)
2758 		evsel->handler = handler;
2759 }
2760 
2761 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2762 {
2763 	const char *trace_usage[] = {
2764 		"perf trace [<options>] [<command>]",
2765 		"perf trace [<options>] -- <command> [<options>]",
2766 		"perf trace record [<options>] [<command>]",
2767 		"perf trace record [<options>] -- <command> [<options>]",
2768 		NULL
2769 	};
2770 	struct trace trace = {
2771 		.syscalls = {
2772 			. max = -1,
2773 		},
2774 		.opts = {
2775 			.target = {
2776 				.uid	   = UINT_MAX,
2777 				.uses_mmap = true,
2778 			},
2779 			.user_freq     = UINT_MAX,
2780 			.user_interval = ULLONG_MAX,
2781 			.no_buffering  = true,
2782 			.mmap_pages    = UINT_MAX,
2783 			.proc_map_timeout  = 500,
2784 		},
2785 		.output = stderr,
2786 		.show_comm = true,
2787 		.trace_syscalls = true,
2788 		.kernel_syscallchains = false,
2789 		.max_stack = UINT_MAX,
2790 	};
2791 	const char *output_name = NULL;
2792 	const char *ev_qualifier_str = NULL;
2793 	const struct option trace_options[] = {
2794 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2795 		     "event selector. use 'perf list' to list available events",
2796 		     parse_events_option),
2797 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2798 		    "show the thread COMM next to its id"),
2799 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2800 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2801 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2802 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2803 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2804 		    "trace events on existing process id"),
2805 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2806 		    "trace events on existing thread id"),
2807 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2808 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2809 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2810 		    "system-wide collection from all CPUs"),
2811 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2812 		    "list of cpus to monitor"),
2813 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2814 		    "child tasks do not inherit counters"),
2815 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2816 		     "number of mmap data pages",
2817 		     perf_evlist__parse_mmap_pages),
2818 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2819 		   "user to profile"),
2820 	OPT_CALLBACK(0, "duration", &trace, "float",
2821 		     "show only events with duration > N.M ms",
2822 		     trace__set_duration),
2823 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2824 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2825 	OPT_BOOLEAN('T', "time", &trace.full_time,
2826 		    "Show full timestamp, not time relative to first start"),
2827 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2828 		    "Show only syscall summary with statistics"),
2829 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2830 		    "Show all syscalls and summary with statistics"),
2831 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2832 		     "Trace pagefaults", parse_pagefaults, "maj"),
2833 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2834 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2835 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2836 		     "record_mode[,record_size]", record_callchain_help,
2837 		     &record_parse_callchain_opt),
2838 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2839 		    "Show the kernel callchains on the syscall exit path"),
2840 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2841 		     "Set the minimum stack depth when parsing the callchain, "
2842 		     "anything below the specified depth will be ignored."),
2843 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2844 		     "Set the maximum stack depth when parsing the callchain, "
2845 		     "anything beyond the specified depth will be ignored. "
2846 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2847 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2848 			"per thread proc mmap processing timeout in ms"),
2849 	OPT_END()
2850 	};
2851 	bool __maybe_unused max_stack_user_set = true;
2852 	bool mmap_pages_user_set = true;
2853 	const char * const trace_subcommands[] = { "record", NULL };
2854 	int err;
2855 	char bf[BUFSIZ];
2856 
2857 	signal(SIGSEGV, sighandler_dump_stack);
2858 	signal(SIGFPE, sighandler_dump_stack);
2859 
2860 	trace.evlist = perf_evlist__new();
2861 	trace.sctbl = syscalltbl__new();
2862 
2863 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2864 		pr_err("Not enough memory to run!\n");
2865 		err = -ENOMEM;
2866 		goto out;
2867 	}
2868 
2869 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2870 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2871 
2872 	err = bpf__setup_stdout(trace.evlist);
2873 	if (err) {
2874 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2875 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2876 		goto out;
2877 	}
2878 
2879 	err = -1;
2880 
2881 	if (trace.trace_pgfaults) {
2882 		trace.opts.sample_address = true;
2883 		trace.opts.sample_time = true;
2884 	}
2885 
2886 	if (trace.opts.mmap_pages == UINT_MAX)
2887 		mmap_pages_user_set = false;
2888 
2889 	if (trace.max_stack == UINT_MAX) {
2890 		trace.max_stack = sysctl_perf_event_max_stack;
2891 		max_stack_user_set = false;
2892 	}
2893 
2894 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2895 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled)
2896 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2897 #endif
2898 
2899 	if (callchain_param.enabled) {
2900 		if (!mmap_pages_user_set && geteuid() == 0)
2901 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2902 
2903 		symbol_conf.use_callchain = true;
2904 	}
2905 
2906 	if (trace.evlist->nr_entries > 0)
2907 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2908 
2909 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2910 		return trace__record(&trace, argc-1, &argv[1]);
2911 
2912 	/* summary_only implies summary option, but don't overwrite summary if set */
2913 	if (trace.summary_only)
2914 		trace.summary = trace.summary_only;
2915 
2916 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2917 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2918 		pr_err("Please specify something to trace.\n");
2919 		return -1;
2920 	}
2921 
2922 	if (!trace.trace_syscalls && ev_qualifier_str) {
2923 		pr_err("The -e option can't be used with --no-syscalls.\n");
2924 		goto out;
2925 	}
2926 
2927 	if (output_name != NULL) {
2928 		err = trace__open_output(&trace, output_name);
2929 		if (err < 0) {
2930 			perror("failed to create output file");
2931 			goto out;
2932 		}
2933 	}
2934 
2935 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
2936 
2937 	if (ev_qualifier_str != NULL) {
2938 		const char *s = ev_qualifier_str;
2939 		struct strlist_config slist_config = {
2940 			.dirname = system_path(STRACE_GROUPS_DIR),
2941 		};
2942 
2943 		trace.not_ev_qualifier = *s == '!';
2944 		if (trace.not_ev_qualifier)
2945 			++s;
2946 		trace.ev_qualifier = strlist__new(s, &slist_config);
2947 		if (trace.ev_qualifier == NULL) {
2948 			fputs("Not enough memory to parse event qualifier",
2949 			      trace.output);
2950 			err = -ENOMEM;
2951 			goto out_close;
2952 		}
2953 
2954 		err = trace__validate_ev_qualifier(&trace);
2955 		if (err)
2956 			goto out_close;
2957 	}
2958 
2959 	err = target__validate(&trace.opts.target);
2960 	if (err) {
2961 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2962 		fprintf(trace.output, "%s", bf);
2963 		goto out_close;
2964 	}
2965 
2966 	err = target__parse_uid(&trace.opts.target);
2967 	if (err) {
2968 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2969 		fprintf(trace.output, "%s", bf);
2970 		goto out_close;
2971 	}
2972 
2973 	if (!argc && target__none(&trace.opts.target))
2974 		trace.opts.target.system_wide = true;
2975 
2976 	if (input_name)
2977 		err = trace__replay(&trace);
2978 	else
2979 		err = trace__run(&trace, argc, argv);
2980 
2981 out_close:
2982 	if (output_name != NULL)
2983 		fclose(trace.output);
2984 out:
2985 	return err;
2986 }
2987