xref: /linux/tools/perf/builtin-trace.c (revision 2ec8107d8e0d1d285b2bbf1999e7f4e46b6b535b)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40 
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49 
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC		02000000
52 #endif
53 
/*
 * Global state for a 'perf trace' session: perf_tool callbacks, the
 * per-syscall descriptor table, the evsels/threads being traced and the
 * knobs set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in ->table */
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the last sample seen */
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to ids */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids whose events are discarded */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* tool stats, see show_tool_stats */
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is a deny list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;	/* TRACE_PFMAJ/TRACE_PFMIN bits */
	int			open_id;	/* syscall id of open(), for vfs_getname */
};
107 
/*
 * Accessor for one tracepoint payload field: its byte offset into
 * sample->raw_data plus a reader that decodes it either as an integer
 * or as a raw pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate tp_field__u{8,16,32,64}() readers; memcpy avoids unaligned
 * access into the raw tracepoint payload.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128 
/*
 * Byte-swapping variants of the integer readers, used when the perf.data
 * file was recorded on a host with different endianness (evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140 
141 static int tp_field__init_uint(struct tp_field *field,
142 			       struct format_field *format_field,
143 			       bool needs_swap)
144 {
145 	field->offset = format_field->offset;
146 
147 	switch (format_field->size) {
148 	case 1:
149 		field->integer = tp_field__u8;
150 		break;
151 	case 2:
152 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
153 		break;
154 	case 4:
155 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
156 		break;
157 	case 8:
158 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
159 		break;
160 	default:
161 		return -1;
162 	}
163 
164 	return 0;
165 }
166 
/* Return a pointer straight into the raw tracepoint payload at the field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind the pointer reader; always succeeds, returns 0 for symmetry with the uint init. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
178 
/*
 * Per-evsel (evsel->priv) accessors for the raw_syscalls tracepoints:
 * the syscall 'id' plus either the entry 'args' or the exit 'ret' field
 * (a union, as a given evsel is only ever sys_enter or sys_exit).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
185 
186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 					  struct tp_field *field,
188 					  const char *name)
189 {
190 	struct format_field *format_field = perf_evsel__field(evsel, name);
191 
192 	if (format_field == NULL)
193 		return -1;
194 
195 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197 
/*
 * Initialize the syscall_tp member 'name' in evsel->priv from the
 * tracepoint format field of the same name (stringified via #name).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201 
202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203 					 struct tp_field *field,
204 					 const char *name)
205 {
206 	struct format_field *format_field = perf_evsel__field(evsel, name);
207 
208 	if (format_field == NULL)
209 		return -1;
210 
211 	return tp_field__init_ptr(field, format_field);
212 }
213 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217 
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
223 
224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226 	evsel->priv = malloc(sizeof(struct syscall_tp));
227 	if (evsel->priv != NULL) {
228 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 			goto out_delete;
230 
231 		evsel->handler = handler;
232 		return 0;
233 	}
234 
235 	return -ENOMEM;
236 
237 out_delete:
238 	zfree(&evsel->priv);
239 	return -ENOENT;
240 }
241 
/*
 * Create an evsel for the raw_syscalls:{sys_enter,sys_exit} tracepoint
 * named by 'direction', wire up its syscall_tp private area and sample
 * handler.  Returns NULL on failure.
 *
 * perf_evsel__newtp() returns an ERR_PTR() encoded error, hence the
 * IS_ERR() checks rather than NULL tests.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
262 
/* Read the syscall_tp member 'name' from a sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read the syscall_tp member 'name' from a sample as a payload pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * Context handed to the per-argument pretty-printers (the "beautifiers"):
 * the raw argument value, which argument slot it is (idx), a mask of
 * arguments to suppress, and an optional printer-specific parameter.
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* e.g. a struct strarray for SCA_STRARRAY */
	u8	      idx;
	u8	      mask;
};

/*
 * Table mapping integer argument values to symbolic names; 'offset' is
 * subtracted from the value before indexing (for 1-based enums etc).
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
296 
297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
298 						const char *intfmt,
299 					        struct syscall_arg *arg)
300 {
301 	struct strarray *sa = arg->parm;
302 	int idx = arg->val - sa->offset;
303 
304 	if (idx < 0 || idx >= sa->nr_entries)
305 		return scnprintf(bf, size, intfmt, arg->val);
306 
307 	return scnprintf(bf, size, "%s", sa->entries[idx]);
308 }
309 
/* Strarray beautifier with a decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* Strarray beautifier with a hexadecimal fallback, used for ioctl cmds. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331 
332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
333 					struct syscall_arg *arg);
334 
335 #define SCA_FD syscall_arg__scnprintf_fd
336 
337 #ifndef AT_FDCWD
338 #define AT_FDCWD	-100
339 #endif
340 
341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
342 					   struct syscall_arg *arg)
343 {
344 	int fd = arg->val;
345 
346 	if (fd == AT_FDCWD)
347 		return scnprintf(bf, size, "CWD");
348 
349 	return syscall_arg__scnprintf_fd(bf, size, arg);
350 }
351 
352 #define SCA_FDAT syscall_arg__scnprintf_fd_at
353 
354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
355 					      struct syscall_arg *arg);
356 
357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358 
/* Print a syscall argument as hex, used e.g. for addresses. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
366 
367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
368 					 struct syscall_arg *arg)
369 {
370 	return scnprintf(bf, size, "%d", arg->val);
371 }
372 
373 #define SCA_INT syscall_arg__scnprintf_int
374 
/*
 * Value -> symbolic name tables consumed by the SCA_STRARRAY beautifier
 * via the STRARRAY() initializer in syscall_fmts[].  Entry order must
 * track the corresponding kernel enum/constant values.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
440 
441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
442 						 struct syscall_arg *arg)
443 {
444 	size_t printed = 0;
445 	int mode = arg->val;
446 
447 	if (mode == F_OK) /* 0 */
448 		return scnprintf(bf, size, "F");
449 #define	P_MODE(n) \
450 	if (mode & n##_OK) { \
451 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
452 		mode &= ~n##_OK; \
453 	}
454 
455 	P_MODE(R);
456 	P_MODE(W);
457 	P_MODE(X);
458 #undef P_MODE
459 
460 	if (mode)
461 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
462 
463 	return printed;
464 }
465 
466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
467 
468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
469 					      struct syscall_arg *arg);
470 
471 #define SCA_FILENAME syscall_arg__scnprintf_filename
472 
473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
474 						struct syscall_arg *arg)
475 {
476 	int printed = 0, flags = arg->val;
477 
478 #define	P_FLAG(n) \
479 	if (flags & O_##n) { \
480 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
481 		flags &= ~O_##n; \
482 	}
483 
484 	P_FLAG(CLOEXEC);
485 	P_FLAG(NONBLOCK);
486 #undef P_FLAG
487 
488 	if (flags)
489 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
490 
491 	return printed;
492 }
493 
494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * tty ioctl request names, indexed from TCGETS (0x5401) via the
 * strarray offset below.  Designated initializers ([0x27], [0x50],
 * [0x60]) skip over the gaps in the ioctl number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
522 
523 #ifndef GRND_NONBLOCK
524 #define GRND_NONBLOCK	0x0001
525 #endif
526 #ifndef GRND_RANDOM
527 #define GRND_RANDOM	0x0002
528 #endif
529 
530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
531 						   struct syscall_arg *arg)
532 {
533 	int printed = 0, flags = arg->val;
534 
535 #define	P_FLAG(n) \
536 	if (flags & GRND_##n) { \
537 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
538 		flags &= ~GRND_##n; \
539 	}
540 
541 	P_FLAG(RANDOM);
542 	P_FLAG(NONBLOCK);
543 #undef P_FLAG
544 
545 	if (flags)
546 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
547 
548 	return printed;
549 }
550 
551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
552 
/*
 * Initializer fragment for a syscall_fmts[] entry: beautify argument
 * slot 'arg' with SCA_STRARRAY using the strarray__<array> table.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
556 
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571 
572 static struct syscall_fmt {
573 	const char *name;
574 	const char *alias;
575 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576 	void	   *arg_parm[6];
577 	bool	   errmsg;
578 	bool	   errpid;
579 	bool	   timeout;
580 	bool	   hexret;
581 } syscall_fmts[] = {
582 	{ .name	    = "access",	    .errmsg = true,
583 	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
585 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586 	{ .name	    = "brk",	    .hexret = true,
587 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588 	{ .name	    = "chdir",	    .errmsg = true, },
589 	{ .name	    = "chmod",	    .errmsg = true, },
590 	{ .name	    = "chroot",	    .errmsg = true, },
591 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 	{ .name	    = "clone",	    .errpid = true, },
593 	{ .name	    = "close",	    .errmsg = true,
594 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 	{ .name	    = "connect",    .errmsg = true, },
596 	{ .name	    = "creat",	    .errmsg = true, },
597 	{ .name	    = "dup",	    .errmsg = true, },
598 	{ .name	    = "dup2",	    .errmsg = true, },
599 	{ .name	    = "dup3",	    .errmsg = true, },
600 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601 	{ .name	    = "eventfd2",   .errmsg = true,
602 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603 	{ .name	    = "faccessat",  .errmsg = true, },
604 	{ .name	    = "fadvise64",  .errmsg = true, },
605 	{ .name	    = "fallocate",  .errmsg = true, },
606 	{ .name	    = "fchdir",	    .errmsg = true, },
607 	{ .name	    = "fchmod",	    .errmsg = true, },
608 	{ .name	    = "fchmodat",   .errmsg = true,
609 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610 	{ .name	    = "fchown",	    .errmsg = true, },
611 	{ .name	    = "fchownat",   .errmsg = true,
612 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613 	{ .name	    = "fcntl",	    .errmsg = true,
614 	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616 	{ .name	    = "fdatasync",  .errmsg = true, },
617 	{ .name	    = "flock",	    .errmsg = true,
618 	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619 	{ .name	    = "fsetxattr",  .errmsg = true, },
620 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
621 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622 	{ .name	    = "fstatfs",    .errmsg = true, },
623 	{ .name	    = "fsync",    .errmsg = true, },
624 	{ .name	    = "ftruncate", .errmsg = true, },
625 	{ .name	    = "futex",	    .errmsg = true,
626 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627 	{ .name	    = "futimesat", .errmsg = true,
628 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629 	{ .name	    = "getdents",   .errmsg = true, },
630 	{ .name	    = "getdents64", .errmsg = true, },
631 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632 	{ .name	    = "getpid",	    .errpid = true, },
633 	{ .name	    = "getpgid",    .errpid = true, },
634 	{ .name	    = "getppid",    .errpid = true, },
635 	{ .name	    = "getrandom",  .errmsg = true,
636 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638 	{ .name	    = "getxattr",   .errmsg = true, },
639 	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
640 	{ .name	    = "ioctl",	    .errmsg = true,
641 	  .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646 			     [1] = SCA_STRHEXARRAY, /* cmd */
647 			     [2] = SCA_HEX, /* arg */ },
648 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650 			     [2] = SCA_HEX, /* arg */ }, },
651 #endif
652 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
653 	{ .name	    = "kill",	    .errmsg = true,
654 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655 	{ .name	    = "lchown",    .errmsg = true, },
656 	{ .name	    = "lgetxattr",  .errmsg = true, },
657 	{ .name	    = "linkat",	    .errmsg = true,
658 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659 	{ .name	    = "listxattr",  .errmsg = true, },
660 	{ .name	    = "llistxattr", .errmsg = true, },
661 	{ .name	    = "lremovexattr",  .errmsg = true, },
662 	{ .name	    = "lseek",	    .errmsg = true,
663 	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
665 	{ .name	    = "lsetxattr",  .errmsg = true, },
666 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
667 	{ .name	    = "lsxattr",    .errmsg = true, },
668 	{ .name     = "madvise",    .errmsg = true,
669 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
670 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
671 	{ .name	    = "mkdir",    .errmsg = true, },
672 	{ .name	    = "mkdirat",    .errmsg = true,
673 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674 	{ .name	    = "mknod",      .errmsg = true, },
675 	{ .name	    = "mknodat",    .errmsg = true,
676 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677 	{ .name	    = "mlock",	    .errmsg = true,
678 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679 	{ .name	    = "mlockall",   .errmsg = true,
680 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 	{ .name	    = "mmap",	    .hexret = true,
682 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
683 			     [2] = SCA_MMAP_PROT, /* prot */
684 			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
685 	{ .name	    = "mprotect",   .errmsg = true,
686 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
687 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
688 	{ .name	    = "mq_unlink", .errmsg = true,
689 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
690 	{ .name	    = "mremap",	    .hexret = true,
691 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
692 			     [3] = SCA_MREMAP_FLAGS, /* flags */
693 			     [4] = SCA_HEX, /* new_addr */ }, },
694 	{ .name	    = "munlock",    .errmsg = true,
695 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
696 	{ .name	    = "munmap",	    .errmsg = true,
697 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
698 	{ .name	    = "name_to_handle_at", .errmsg = true,
699 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
700 	{ .name	    = "newfstatat", .errmsg = true,
701 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
702 	{ .name	    = "open",	    .errmsg = true,
703 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
704 	{ .name	    = "open_by_handle_at", .errmsg = true,
705 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
706 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
707 	{ .name	    = "openat",	    .errmsg = true,
708 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
709 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
710 	{ .name	    = "perf_event_open", .errmsg = true,
711 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
712 			     [3] = SCA_FD,  /* group_fd */
713 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
714 	{ .name	    = "pipe2",	    .errmsg = true,
715 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
716 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
717 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
718 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
719 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
720 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
721 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
722 	{ .name	    = "pwritev",    .errmsg = true, },
723 	{ .name	    = "read",	    .errmsg = true, },
724 	{ .name	    = "readlink",   .errmsg = true, },
725 	{ .name	    = "readlinkat", .errmsg = true,
726 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
727 	{ .name	    = "readv",	    .errmsg = true, },
728 	{ .name	    = "recvfrom",   .errmsg = true,
729 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
730 	{ .name	    = "recvmmsg",   .errmsg = true,
731 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
732 	{ .name	    = "recvmsg",    .errmsg = true,
733 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
734 	{ .name	    = "removexattr", .errmsg = true, },
735 	{ .name	    = "renameat",   .errmsg = true,
736 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
737 	{ .name	    = "rmdir",    .errmsg = true, },
738 	{ .name	    = "rt_sigaction", .errmsg = true,
739 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
740 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
741 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
742 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
743 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
744 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
745 	{ .name	    = "sched_getattr",	      .errmsg = true, },
746 	{ .name	    = "sched_setattr",	      .errmsg = true, },
747 	{ .name	    = "sched_setscheduler",   .errmsg = true,
748 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
749 	{ .name	    = "seccomp", .errmsg = true,
750 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
751 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
752 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
753 	{ .name	    = "sendmmsg",    .errmsg = true,
754 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
755 	{ .name	    = "sendmsg",    .errmsg = true,
756 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
757 	{ .name	    = "sendto",	    .errmsg = true,
758 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759 	{ .name	    = "set_tid_address", .errpid = true, },
760 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
761 	{ .name	    = "setpgid",    .errmsg = true, },
762 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
763 	{ .name	    = "setxattr",   .errmsg = true, },
764 	{ .name	    = "shutdown",   .errmsg = true, },
765 	{ .name	    = "socket",	    .errmsg = true,
766 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
767 			     [1] = SCA_SK_TYPE, /* type */ },
768 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
769 	{ .name	    = "socketpair", .errmsg = true,
770 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771 			     [1] = SCA_SK_TYPE, /* type */ },
772 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
773 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
774 	{ .name	    = "statfs",	    .errmsg = true, },
775 	{ .name	    = "swapoff",    .errmsg = true,
776 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
777 	{ .name	    = "swapon",	    .errmsg = true,
778 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
779 	{ .name	    = "symlinkat",  .errmsg = true,
780 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
781 	{ .name	    = "tgkill",	    .errmsg = true,
782 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
783 	{ .name	    = "tkill",	    .errmsg = true,
784 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
785 	{ .name	    = "truncate",   .errmsg = true, },
786 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
787 	{ .name	    = "unlinkat",   .errmsg = true,
788 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
789 	{ .name	    = "utime",  .errmsg = true, },
790 	{ .name	    = "utimensat",  .errmsg = true,
791 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
792 	{ .name	    = "utimes",  .errmsg = true, },
793 	{ .name	    = "vmsplice",  .errmsg = true, },
794 	{ .name	    = "wait4",	    .errpid = true,
795 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
796 	{ .name	    = "waitid",	    .errpid = true,
797 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
798 	{ .name	    = "write",	    .errmsg = true, },
799 	{ .name	    = "writev",	    .errmsg = true, },
800 };
801 
802 static int syscall_fmt__cmp(const void *name, const void *fmtp)
803 {
804 	const struct syscall_fmt *fmt = fmtp;
805 	return strcmp(name, fmt->name);
806 }
807 
808 static struct syscall_fmt *syscall_fmt__find(const char *name)
809 {
810 	const int nmemb = ARRAY_SIZE(syscall_fmts);
811 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
812 }
813 
/*
 * Per-syscall descriptor, one entry in trace->syscalls.table: the
 * tracepoint format, argument list and the formatting override (if any)
 * found in syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* syscall never returns (e.g. exit_group) -- verify at init site */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
824 
825 static size_t fprintf_duration(unsigned long t, FILE *fp)
826 {
827 	double duration = (double)t / NSEC_PER_MSEC;
828 	size_t printed = fprintf(fp, "(");
829 
830 	if (duration >= 1.0)
831 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832 	else if (duration >= 0.01)
833 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834 	else
835 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836 	return printed + fprintf(fp, "): ");
837 }
838 
/**
 * struct thread_trace - per-thread state, attached via thread__set_priv().
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;	/* bumped by thread__trace() */
	unsigned long	  pfmaj, pfmin;	/* presumably major/minor page fault counts -- confirm at fault handler */
	char		  *entry_str;
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd with a cached path; -1 = empty table */
		char	  **table;	/* fd -> strdup'd pathname, see trace__set_fd_pathname() */
	} paths;

	struct intlist *syscall_stats;
};
865 
866 static struct thread_trace *thread_trace__new(void)
867 {
868 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
869 
870 	if (ttrace)
871 		ttrace->paths.max = -1;
872 
873 	ttrace->syscall_stats = intlist__new(NULL);
874 
875 	return ttrace;
876 }
877 
878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
879 {
880 	struct thread_trace *ttrace;
881 
882 	if (thread == NULL)
883 		goto fail;
884 
885 	if (thread__priv(thread) == NULL)
886 		thread__set_priv(thread, thread_trace__new());
887 
888 	if (thread__priv(thread) == NULL)
889 		goto fail;
890 
891 	ttrace = thread__priv(thread);
892 	++ttrace->nr_events;
893 
894 	return ttrace;
895 fail:
896 	color_fprintf(fp, PERF_COLOR_RED,
897 		      "WARNING: not enough memory, dropping samples!\n");
898 	return NULL;
899 }
900 
/* Page fault classes selectable with --pf {maj,min,all} */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is staged */
static const size_t trace__entry_str_size = 2048;
905 
906 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
907 {
908 	struct thread_trace *ttrace = thread__priv(thread);
909 
910 	if (fd > ttrace->paths.max) {
911 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
912 
913 		if (npath == NULL)
914 			return -1;
915 
916 		if (ttrace->paths.max != -1) {
917 			memset(npath + ttrace->paths.max + 1, 0,
918 			       (fd - ttrace->paths.max) * sizeof(char *));
919 		} else {
920 			memset(npath, 0, (fd + 1) * sizeof(char *));
921 		}
922 
923 		ttrace->paths.table = npath;
924 		ttrace->paths.max   = fd;
925 	}
926 
927 	ttrace->paths.table[fd] = strdup(pathname);
928 
929 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
930 }
931 
932 static int thread__read_fd_path(struct thread *thread, int fd)
933 {
934 	char linkname[PATH_MAX], pathname[PATH_MAX];
935 	struct stat st;
936 	int ret;
937 
938 	if (thread->pid_ == thread->tid) {
939 		scnprintf(linkname, sizeof(linkname),
940 			  "/proc/%d/fd/%d", thread->pid_, fd);
941 	} else {
942 		scnprintf(linkname, sizeof(linkname),
943 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
944 	}
945 
946 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
947 		return -1;
948 
949 	ret = readlink(linkname, pathname, sizeof(pathname));
950 
951 	if (ret < 0 || ret > st.st_size)
952 		return -1;
953 
954 	pathname[ret] = '\0';
955 	return trace__set_fd_pathname(thread, fd, pathname);
956 }
957 
958 static const char *thread__fd_path(struct thread *thread, int fd,
959 				   struct trace *trace)
960 {
961 	struct thread_trace *ttrace = thread__priv(thread);
962 
963 	if (ttrace == NULL)
964 		return NULL;
965 
966 	if (fd < 0)
967 		return NULL;
968 
969 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
970 		if (!trace->live)
971 			return NULL;
972 		++trace->stats.proc_getname;
973 		if (thread__read_fd_path(thread, fd))
974 			return NULL;
975 	}
976 
977 	return ttrace->paths.table[fd];
978 }
979 
980 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
981 					struct syscall_arg *arg)
982 {
983 	int fd = arg->val;
984 	size_t printed = scnprintf(bf, size, "%d", fd);
985 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
986 
987 	if (path)
988 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
989 
990 	return printed;
991 }
992 
993 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
994 					      struct syscall_arg *arg)
995 {
996 	int fd = arg->val;
997 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
998 	struct thread_trace *ttrace = thread__priv(arg->thread);
999 
1000 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1001 		zfree(&ttrace->paths.table[fd]);
1002 
1003 	return printed;
1004 }
1005 
1006 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1007 				     unsigned long ptr)
1008 {
1009 	struct thread_trace *ttrace = thread__priv(thread);
1010 
1011 	ttrace->filename.ptr = ptr;
1012 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1013 }
1014 
1015 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1016 					      struct syscall_arg *arg)
1017 {
1018 	unsigned long ptr = arg->val;
1019 
1020 	if (!arg->trace->vfs_getname)
1021 		return scnprintf(bf, size, "%#x", ptr);
1022 
1023 	thread__set_filename_pos(arg->thread, bf, ptr);
1024 	return 0;
1025 }
1026 
1027 static bool trace__filter_duration(struct trace *trace, double t)
1028 {
1029 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1030 }
1031 
1032 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1033 {
1034 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1035 
1036 	return fprintf(fp, "%10.3f ", ts);
1037 }
1038 
/* Set from the signal handler to make the main event loop wind down */
static bool done = false;
/* SIGINT (vs e.g. child exit): stop without further processing */
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1047 
1048 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1049 					u64 duration, u64 tstamp, FILE *fp)
1050 {
1051 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1052 	printed += fprintf_duration(duration, fp);
1053 
1054 	if (trace->multiple_threads) {
1055 		if (trace->show_comm)
1056 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1057 		printed += fprintf(fp, "%d ", thread->tid);
1058 	}
1059 
1060 	return printed;
1061 }
1062 
1063 static int trace__process_event(struct trace *trace, struct machine *machine,
1064 				union perf_event *event, struct perf_sample *sample)
1065 {
1066 	int ret = 0;
1067 
1068 	switch (event->header.type) {
1069 	case PERF_RECORD_LOST:
1070 		color_fprintf(trace->output, PERF_COLOR_RED,
1071 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1072 		ret = machine__process_lost_event(machine, event, sample);
1073 		break;
1074 	default:
1075 		ret = machine__process_event(machine, event, sample);
1076 		break;
1077 	}
1078 
1079 	return ret;
1080 }
1081 
1082 static int trace__tool_process(struct perf_tool *tool,
1083 			       union perf_event *event,
1084 			       struct perf_sample *sample,
1085 			       struct machine *machine)
1086 {
1087 	struct trace *trace = container_of(tool, struct trace, tool);
1088 	return trace__process_event(trace, machine, event, sample);
1089 }
1090 
1091 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1092 {
1093 	struct machine *machine = vmachine;
1094 
1095 	if (machine->kptr_restrict_warned)
1096 		return NULL;
1097 
1098 	if (symbol_conf.kptr_restrict) {
1099 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1100 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1101 			   "Kernel samples will not be resolved.\n");
1102 		machine->kptr_restrict_warned = true;
1103 		return NULL;
1104 	}
1105 
1106 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1107 }
1108 
1109 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1110 {
1111 	int err = symbol__init(NULL);
1112 
1113 	if (err)
1114 		return err;
1115 
1116 	trace->host = machine__new_host();
1117 	if (trace->host == NULL)
1118 		return -ENOMEM;
1119 
1120 	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1121 		return -errno;
1122 
1123 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1124 					    evlist->threads, trace__tool_process, false,
1125 					    trace->opts.proc_map_timeout);
1126 	if (err)
1127 		symbol__exit();
1128 
1129 	return err;
1130 }
1131 
1132 static int syscall__set_arg_fmts(struct syscall *sc)
1133 {
1134 	struct format_field *field;
1135 	int idx = 0, len;
1136 
1137 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1138 	if (sc->arg_scnprintf == NULL)
1139 		return -1;
1140 
1141 	if (sc->fmt)
1142 		sc->arg_parm = sc->fmt->arg_parm;
1143 
1144 	for (field = sc->args; field; field = field->next) {
1145 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1146 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1147 		else if (strcmp(field->type, "const char *") == 0 &&
1148 			 (strcmp(field->name, "filename") == 0 ||
1149 			  strcmp(field->name, "path") == 0 ||
1150 			  strcmp(field->name, "pathname") == 0))
1151 			sc->arg_scnprintf[idx] = SCA_FILENAME;
1152 		else if (field->flags & FIELD_IS_POINTER)
1153 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1154 		else if (strcmp(field->type, "pid_t") == 0)
1155 			sc->arg_scnprintf[idx] = SCA_PID;
1156 		else if (strcmp(field->type, "umode_t") == 0)
1157 			sc->arg_scnprintf[idx] = SCA_MODE_T;
1158 		else if ((strcmp(field->type, "int") == 0 ||
1159 			  strcmp(field->type, "unsigned int") == 0 ||
1160 			  strcmp(field->type, "long") == 0) &&
1161 			 (len = strlen(field->name)) >= 2 &&
1162 			 strcmp(field->name + len - 2, "fd") == 0) {
1163 			/*
1164 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1165 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1166 			 * 65 int
1167 			 * 23 unsigned int
1168 			 * 7 unsigned long
1169 			 */
1170 			sc->arg_scnprintf[idx] = SCA_FD;
1171 		}
1172 		++idx;
1173 	}
1174 
1175 	return 0;
1176 }
1177 
/*
 * Lazily fill in trace->syscalls.table[id]: resolve the syscall name,
 * look up its formatting overrides and its syscalls:sys_enter_* event
 * format, growing the id-indexed table as needed.
 * Returns 0 on success, -1 when the id or its tracepoint is unknown.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table to cover this id, zeroing the new slots */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some tracepoints are named after an alias (e.g. old vs new name) */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1236 
/*
 * Translate the -e/--expr list of syscall names (trace->ev_qualifier)
 * into syscall ids in trace->ev_qualifier_ids. All invalid names are
 * reported in one message; on any error the ids array is freed and
 * -EINVAL returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First offender opens the message, the rest are comma separated */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1282 
1283 /*
1284  * args is to be interpreted as a series of longs but we need to handle
1285  * 8-byte unaligned accesses. args points to raw_data within the event
1286  * and raw_data is guaranteed to be 8-byte unaligned because it is
1287  * preceded by raw_size which is a u32. So we need to copy args to a temp
1288  * variable to read it. Most notably this avoids extended load instructions
1289  * on unaligned addresses
1290  */
1291 
1292 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1293 				      unsigned char *args, struct trace *trace,
1294 				      struct thread *thread)
1295 {
1296 	size_t printed = 0;
1297 	unsigned char *p;
1298 	unsigned long val;
1299 
1300 	if (sc->args != NULL) {
1301 		struct format_field *field;
1302 		u8 bit = 1;
1303 		struct syscall_arg arg = {
1304 			.idx	= 0,
1305 			.mask	= 0,
1306 			.trace  = trace,
1307 			.thread = thread,
1308 		};
1309 
1310 		for (field = sc->args; field;
1311 		     field = field->next, ++arg.idx, bit <<= 1) {
1312 			if (arg.mask & bit)
1313 				continue;
1314 
1315 			/* special care for unaligned accesses */
1316 			p = args + sizeof(unsigned long) * arg.idx;
1317 			memcpy(&val, p, sizeof(val));
1318 
1319 			/*
1320  			 * Suppress this argument if its value is zero and
1321  			 * and we don't have a string associated in an
1322  			 * strarray for it.
1323  			 */
1324 			if (val == 0 &&
1325 			    !(sc->arg_scnprintf &&
1326 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1327 			      sc->arg_parm[arg.idx]))
1328 				continue;
1329 
1330 			printed += scnprintf(bf + printed, size - printed,
1331 					     "%s%s: ", printed ? ", " : "", field->name);
1332 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1333 				arg.val = val;
1334 				if (sc->arg_parm)
1335 					arg.parm = sc->arg_parm[arg.idx];
1336 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1337 								      size - printed, &arg);
1338 			} else {
1339 				printed += scnprintf(bf + printed, size - printed,
1340 						     "%ld", val);
1341 			}
1342 		}
1343 	} else if (IS_ERR(sc->tp_format)) {
1344 		/*
1345 		 * If we managed to read the tracepoint /format file, then we
1346 		 * may end up not having any args, like with gettid(), so only
1347 		 * print the raw args when we didn't manage to read it.
1348 		 */
1349 		int i = 0;
1350 
1351 		while (i < 6) {
1352 			/* special care for unaligned accesses */
1353 			p = args + sizeof(unsigned long) * i;
1354 			memcpy(&val, p, sizeof(val));
1355 			printed += scnprintf(bf + printed, size - printed,
1356 					     "%sarg%d: %ld",
1357 					     printed ? ", " : "", i, val);
1358 			++i;
1359 		}
1360 	}
1361 
1362 	return printed;
1363 }
1364 
/* Signature shared by all tracepoint sample handlers (stored in evsel->handler) */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1368 
/*
 * Return the struct syscall for an id, reading its info on first use.
 * NULL on invalid/unreadable ids, with diagnostics at the appropriate
 * verbosity level.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily read the syscall's tracepoint info on first encounter */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1411 
1412 static void thread__update_stats(struct thread_trace *ttrace,
1413 				 int id, struct perf_sample *sample)
1414 {
1415 	struct int_node *inode;
1416 	struct stats *stats;
1417 	u64 duration = 0;
1418 
1419 	inode = intlist__findnew(ttrace->syscall_stats, id);
1420 	if (inode == NULL)
1421 		return;
1422 
1423 	stats = inode->priv;
1424 	if (stats == NULL) {
1425 		stats = malloc(sizeof(struct stats));
1426 		if (stats == NULL)
1427 			return;
1428 		init_stats(stats);
1429 		inode->priv = stats;
1430 	}
1431 
1432 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1433 		duration = sample->time - ttrace->entry_time;
1434 
1435 	update_stats(stats, duration);
1436 }
1437 
1438 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1439 {
1440 	struct thread_trace *ttrace;
1441 	u64 duration;
1442 	size_t printed;
1443 
1444 	if (trace->current == NULL)
1445 		return 0;
1446 
1447 	ttrace = thread__priv(trace->current);
1448 
1449 	if (!ttrace->entry_pending)
1450 		return 0;
1451 
1452 	duration = sample->time - ttrace->entry_time;
1453 
1454 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, ttrace->entry_time, trace->output);
1455 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1456 	ttrace->entry_pending = false;
1457 
1458 	return printed;
1459 }
1460 
/*
 * raw_syscalls:sys_enter handler: stage the formatted "name(args" line in
 * the thread's entry_str. It is printed immediately for syscalls that
 * don't return (exit/exit_group), otherwise it waits for the matching
 * sys_exit (or for another event to flush it as interrupted).
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the staging buffer for this thread */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush a previous thread's pending entry unless output is filtered */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come for exit/exit_group: print now */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for interrupted-entry flushing */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1519 
1520 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1521 				    struct perf_sample *sample,
1522 				    struct callchain_cursor *cursor)
1523 {
1524 	struct addr_location al;
1525 
1526 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1527 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1528 		return -1;
1529 
1530 	return 0;
1531 }
1532 
1533 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1534 {
1535 	/* TODO: user-configurable print_opts */
1536 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1537 				        EVSEL__PRINT_DSO |
1538 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1539 
1540 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1541 }
1542 
/*
 * raw_syscalls:sys_exit handler: compute the syscall duration, complete
 * the line staged by trace__sys_enter() and print the return value
 * formatted according to the syscall's struct syscall_fmt (errno name,
 * hex, pid with comm, timeout, or plain signed decimal).
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open(): cache the fd -> pathname mapping from vfs_getname */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack not deep enough for --min-stack: skip the event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry was already flushed as interrupted, just close it */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (e.g. clone/wait4): show its comm if known */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1640 
/*
 * probe:vfs_getname handler: copy the resolved pathname into the
 * thread's filename cache and splice it into the staged sys_enter line
 * at the position recorded by thread__set_filename_pos(). When the
 * filename doesn't fit, only its tail is kept.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	/* Grow the per-thread name buffer to hold this pathname */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
				goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No sys_enter recorded a splice point: nothing to patch */
	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	/* Keep the tail of the filename when it doesn't fit */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the splice point and copy the filename in */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	return 0;
}
1697 
1698 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1699 				     union perf_event *event __maybe_unused,
1700 				     struct perf_sample *sample)
1701 {
1702         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1703 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1704 	struct thread *thread = machine__findnew_thread(trace->host,
1705 							sample->pid,
1706 							sample->tid);
1707 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1708 
1709 	if (ttrace == NULL)
1710 		goto out_dump;
1711 
1712 	ttrace->runtime_ms += runtime_ms;
1713 	trace->runtime_ms += runtime_ms;
1714 	thread__put(thread);
1715 	return 0;
1716 
1717 out_dump:
1718 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1719 	       evsel->name,
1720 	       perf_evsel__strval(evsel, sample, "comm"),
1721 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1722 	       runtime,
1723 	       perf_evsel__intval(evsel, sample, "vruntime"));
1724 	thread__put(thread);
1725 	return 0;
1726 }
1727 
1728 static void bpf_output__printer(enum binary_printer_ops op,
1729 				unsigned int val, void *extra)
1730 {
1731 	FILE *output = extra;
1732 	unsigned char ch = (unsigned char)val;
1733 
1734 	switch (op) {
1735 	case BINARY_PRINT_CHAR_DATA:
1736 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1737 		break;
1738 	case BINARY_PRINT_DATA_BEGIN:
1739 	case BINARY_PRINT_LINE_BEGIN:
1740 	case BINARY_PRINT_ADDR:
1741 	case BINARY_PRINT_NUM_DATA:
1742 	case BINARY_PRINT_NUM_PAD:
1743 	case BINARY_PRINT_SEP:
1744 	case BINARY_PRINT_CHAR_PAD:
1745 	case BINARY_PRINT_LINE_END:
1746 	case BINARY_PRINT_DATA_END:
1747 	default:
1748 		break;
1749 	}
1750 }
1751 
/* Dump a BPF output event's raw payload, 8 bytes per line. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1758 
/*
 * Generic handler for --event tracepoints and BPF output events: print
 * the timestamp, event name and its formatted payload, plus the
 * callchain when requested.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack not deep enough for --min-stack: skip the event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1799 
1800 static void print_location(FILE *f, struct perf_sample *sample,
1801 			   struct addr_location *al,
1802 			   bool print_dso, bool print_sym)
1803 {
1804 
1805 	if ((verbose || print_dso) && al->map)
1806 		fprintf(f, "%s@", al->map->dso->long_name);
1807 
1808 	if ((verbose || print_sym) && al->sym)
1809 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1810 			al->addr - al->sym->start);
1811 	else if (al->map)
1812 		fprintf(f, "0x%" PRIx64, al->addr);
1813 	else
1814 		fprintf(f, "0x%" PRIx64, sample->addr);
1815 }
1816 
/*
 * Page fault software event handler: count maj/min faults per thread and
 * print "maj/minfault [ip location] => data location (map-type level)".
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata unless resolved otherwise below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack not deep enough for --min-stack: skip the event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction is */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting access went: try data maps first */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not data: could be an e'x'ecutable map or unknown '?' */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1892 
1893 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1894 {
1895 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1896 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1897 		return false;
1898 
1899 	if (trace->pid_list || trace->tid_list)
1900 		return true;
1901 
1902 	return false;
1903 }
1904 
1905 static void trace__set_base_time(struct trace *trace,
1906 				 struct perf_evsel *evsel,
1907 				 struct perf_sample *sample)
1908 {
1909 	/*
1910 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1911 	 * and don't use sample->time unconditionally, we may end up having
1912 	 * some other event in the future without PERF_SAMPLE_TIME for good
1913 	 * reason, i.e. we may not be interested in its timestamps, just in
1914 	 * it taking place, picking some piece of information when it
1915 	 * appears in our event stream (vfs_getname comes to mind).
1916 	 */
1917 	if (trace->base_time == 0 && !trace->full_time &&
1918 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1919 		trace->base_time = sample->time;
1920 }
1921 
1922 static int trace__process_sample(struct perf_tool *tool,
1923 				 union perf_event *event,
1924 				 struct perf_sample *sample,
1925 				 struct perf_evsel *evsel,
1926 				 struct machine *machine __maybe_unused)
1927 {
1928 	struct trace *trace = container_of(tool, struct trace, tool);
1929 	int err = 0;
1930 
1931 	tracepoint_handler handler = evsel->handler;
1932 
1933 	if (skip_sample(trace, sample))
1934 		return 0;
1935 
1936 	trace__set_base_time(trace, evsel, sample);
1937 
1938 	if (handler) {
1939 		++trace->nr_events;
1940 		handler(trace, evsel, event, sample);
1941 	}
1942 
1943 	return err;
1944 }
1945 
1946 static int parse_target_str(struct trace *trace)
1947 {
1948 	if (trace->opts.target.pid) {
1949 		trace->pid_list = intlist__new(trace->opts.target.pid);
1950 		if (trace->pid_list == NULL) {
1951 			pr_err("Error parsing process id string\n");
1952 			return -EINVAL;
1953 		}
1954 	}
1955 
1956 	if (trace->opts.target.tid) {
1957 		trace->tid_list = intlist__new(trace->opts.target.tid);
1958 		if (trace->tid_list == NULL) {
1959 			pr_err("Error parsing thread id string\n");
1960 			return -EINVAL;
1961 		}
1962 	}
1963 
1964 	return 0;
1965 }
1966 
1967 static int trace__record(struct trace *trace, int argc, const char **argv)
1968 {
1969 	unsigned int rec_argc, i, j;
1970 	const char **rec_argv;
1971 	const char * const record_args[] = {
1972 		"record",
1973 		"-R",
1974 		"-m", "1024",
1975 		"-c", "1",
1976 	};
1977 
1978 	const char * const sc_args[] = { "-e", };
1979 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1980 	const char * const majpf_args[] = { "-e", "major-faults" };
1981 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1982 	const char * const minpf_args[] = { "-e", "minor-faults" };
1983 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1984 
1985 	/* +1 is for the event string below */
1986 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1987 		majpf_args_nr + minpf_args_nr + argc;
1988 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1989 
1990 	if (rec_argv == NULL)
1991 		return -ENOMEM;
1992 
1993 	j = 0;
1994 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1995 		rec_argv[j++] = record_args[i];
1996 
1997 	if (trace->trace_syscalls) {
1998 		for (i = 0; i < sc_args_nr; i++)
1999 			rec_argv[j++] = sc_args[i];
2000 
2001 		/* event string may be different for older kernels - e.g., RHEL6 */
2002 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2003 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2004 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2005 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2006 		else {
2007 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2008 			return -1;
2009 		}
2010 	}
2011 
2012 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2013 		for (i = 0; i < majpf_args_nr; i++)
2014 			rec_argv[j++] = majpf_args[i];
2015 
2016 	if (trace->trace_pgfaults & TRACE_PFMIN)
2017 		for (i = 0; i < minpf_args_nr; i++)
2018 			rec_argv[j++] = minpf_args[i];
2019 
2020 	for (i = 0; i < (unsigned int)argc; i++)
2021 		rec_argv[j++] = argv[i];
2022 
2023 	return cmd_record(j, rec_argv, NULL);
2024 }
2025 
2026 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2027 
2028 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2029 {
2030 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2031 
2032 	if (IS_ERR(evsel))
2033 		return false;
2034 
2035 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2036 		perf_evsel__delete(evsel);
2037 		return false;
2038 	}
2039 
2040 	evsel->handler = trace__vfs_getname;
2041 	perf_evlist__add(evlist, evsel);
2042 	return true;
2043 }
2044 
2045 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2046 {
2047 	struct perf_evsel *evsel;
2048 	struct perf_event_attr attr = {
2049 		.type = PERF_TYPE_SOFTWARE,
2050 		.mmap_data = 1,
2051 	};
2052 
2053 	attr.config = config;
2054 	attr.sample_period = 1;
2055 
2056 	event_attr_init(&attr);
2057 
2058 	evsel = perf_evsel__new(&attr);
2059 	if (evsel)
2060 		evsel->handler = trace__pgfault;
2061 
2062 	return evsel;
2063 }
2064 
2065 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2066 {
2067 	const u32 type = event->header.type;
2068 	struct perf_evsel *evsel;
2069 
2070 	if (type != PERF_RECORD_SAMPLE) {
2071 		trace__process_event(trace, trace->host, event, sample);
2072 		return;
2073 	}
2074 
2075 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2076 	if (evsel == NULL) {
2077 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2078 		return;
2079 	}
2080 
2081 	trace__set_base_time(trace, evsel, sample);
2082 
2083 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2084 	    sample->raw_data == NULL) {
2085 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2086 		       perf_evsel__name(evsel), sample->tid,
2087 		       sample->cpu, sample->raw_size);
2088 	} else {
2089 		tracepoint_handler handler = evsel->handler;
2090 		handler(trace, evsel, event, sample);
2091 	}
2092 }
2093 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, wire up
 * their handlers and argument/return field accessors, and add them to the
 * evlist, caching them in trace->syscalls.events for later filter and
 * callchain configuration.  Returns 0 on success, -1 on failure with any
 * already-created evsel deleted.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* Cache the offset of the "args" payload field for fast access. */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* Ditto for the syscall return value ("ret") field. */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2139 
2140 static int trace__set_ev_qualifier_filter(struct trace *trace)
2141 {
2142 	int err = -1;
2143 	struct perf_evsel *sys_exit;
2144 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2145 						trace->ev_qualifier_ids.nr,
2146 						trace->ev_qualifier_ids.entries);
2147 
2148 	if (filter == NULL)
2149 		goto out_enomem;
2150 
2151 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2152 					  filter)) {
2153 		sys_exit = trace->syscalls.events.sys_exit;
2154 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2155 	}
2156 
2157 	free(filter);
2158 out:
2159 	return err;
2160 out_enomem:
2161 	errno = ENOMEM;
2162 	goto out;
2163 }
2164 
/*
 * Live tracing: set up the requested syscall/pagefault/sched events,
 * optionally fork and trace a workload, mmap the ring buffers and consume
 * events until the workload exits or the user interrupts.  Returns 0 on
 * success, negative on error; the evlist is always torn down on exit.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* remaining argv is the workload */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		/* Only the events below get callchains, so their sample
		 * layouts diverge from the rest of the evlist. */
		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* tids are only worth printing when more than one thread is traced */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* Drain every per-cpu ring buffer of the events accumulated so far. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* On SIGCHLD, stop producing but keep draining buffers. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Unreachable fall-through: this brace block only exists so the error
 * labels below can share one errbuf without it occupying stack space on
 * the normal path.  They are entered exclusively via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2428 
/*
 * Replay mode (-i): process a previously recorded perf.data file through
 * the same syscall/pgfault handlers used in live mode, honoring older
 * kernels that exposed syscalls:* instead of raw_syscalls:* tracepoints.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	/* Samples from different cpus must be replayed in timestamp order. */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to the pgfault printer. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2524 
/* Emit the banner preceding the per-thread summary; returns chars printed. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2533 
/*
 * Re-sort helper (see rb_resort.h): orders the per-thread syscall stats
 * rbtree by total time spent (msecs, descending).  The body below is the
 * per-node entry initializer run while building the sorted tree; a NULL
 * stats pointer yields a zero msecs key.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = nr_calls * average, converted from ns to ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2547 
/*
 * Print one thread's syscall summary table (calls, total/min/avg/max time
 * and stddev), ordered by total time descending via the syscall_stats
 * resort tree defined above.  Returns the number of characters printed,
 * 0 if the thread has no stats.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are accumulated in ns, displayed in ms */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2590 
/*
 * Print one thread's summary line (comm, tid, event count/share, fault
 * counts, runtime with --sched) followed by its syscall stats table.
 * Returns characters printed, 0 for threads with no trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF) /* count the newline when it was written */
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2618 
2619 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2620 {
2621 	return ttrace ? ttrace->nr_events : 0;
2622 }
2623 
/*
 * Re-sort helper: orders the machine's thread rbtree by number of trace
 * events per thread, so the summary lists threads in event-count order.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2630 
/*
 * Print the end-of-run summary: a header followed by one section per
 * traced thread, ordered by per-thread event count via the threads
 * resort tree above.  Returns characters printed, or 0 when the sort
 * tree could not be built.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2649 
2650 static int trace__set_duration(const struct option *opt, const char *str,
2651 			       int unset __maybe_unused)
2652 {
2653 	struct trace *trace = opt->value;
2654 
2655 	trace->duration_filter = atof(str);
2656 	return 0;
2657 }
2658 
2659 static int trace__set_filter_pids(const struct option *opt, const char *str,
2660 				  int unset __maybe_unused)
2661 {
2662 	int ret = -1;
2663 	size_t i;
2664 	struct trace *trace = opt->value;
2665 	/*
2666 	 * FIXME: introduce a intarray class, plain parse csv and create a
2667 	 * { int nr, int entries[] } struct...
2668 	 */
2669 	struct intlist *list = intlist__new(str);
2670 
2671 	if (list == NULL)
2672 		return -1;
2673 
2674 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2675 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2676 
2677 	if (trace->filter_pids.entries == NULL)
2678 		goto out;
2679 
2680 	trace->filter_pids.entries[0] = getpid();
2681 
2682 	for (i = 1; i < trace->filter_pids.nr; ++i)
2683 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2684 
2685 	intlist__delete(list);
2686 	ret = 0;
2687 out:
2688 	return ret;
2689 }
2690 
2691 static int trace__open_output(struct trace *trace, const char *filename)
2692 {
2693 	struct stat st;
2694 
2695 	if (!stat(filename, &st) && st.st_size) {
2696 		char oldname[PATH_MAX];
2697 
2698 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2699 		unlink(oldname);
2700 		rename(filename, oldname);
2701 	}
2702 
2703 	trace->output = fopen(filename, "w");
2704 
2705 	return trace->output == NULL ? -errno : 0;
2706 }
2707 
2708 static int parse_pagefaults(const struct option *opt, const char *str,
2709 			    int unset __maybe_unused)
2710 {
2711 	int *trace_pgfaults = opt->value;
2712 
2713 	if (strcmp(str, "all") == 0)
2714 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2715 	else if (strcmp(str, "maj") == 0)
2716 		*trace_pgfaults |= TRACE_PFMAJ;
2717 	else if (strcmp(str, "min") == 0)
2718 		*trace_pgfaults |= TRACE_PFMIN;
2719 	else
2720 		return -1;
2721 
2722 	return 0;
2723 }
2724 
/* Point every evsel in the list at the same sample handler. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2732 
2733 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2734 {
2735 	const char *trace_usage[] = {
2736 		"perf trace [<options>] [<command>]",
2737 		"perf trace [<options>] -- <command> [<options>]",
2738 		"perf trace record [<options>] [<command>]",
2739 		"perf trace record [<options>] -- <command> [<options>]",
2740 		NULL
2741 	};
2742 	struct trace trace = {
2743 		.syscalls = {
2744 			. max = -1,
2745 		},
2746 		.opts = {
2747 			.target = {
2748 				.uid	   = UINT_MAX,
2749 				.uses_mmap = true,
2750 			},
2751 			.user_freq     = UINT_MAX,
2752 			.user_interval = ULLONG_MAX,
2753 			.no_buffering  = true,
2754 			.mmap_pages    = UINT_MAX,
2755 			.proc_map_timeout  = 500,
2756 		},
2757 		.output = stderr,
2758 		.show_comm = true,
2759 		.trace_syscalls = true,
2760 		.kernel_syscallchains = false,
2761 		.max_stack = UINT_MAX,
2762 	};
2763 	const char *output_name = NULL;
2764 	const char *ev_qualifier_str = NULL;
2765 	const struct option trace_options[] = {
2766 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
2767 		     "event selector. use 'perf list' to list available events",
2768 		     parse_events_option),
2769 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2770 		    "show the thread COMM next to its id"),
2771 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2772 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
2773 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2774 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2775 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2776 		    "trace events on existing process id"),
2777 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2778 		    "trace events on existing thread id"),
2779 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2780 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2781 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2782 		    "system-wide collection from all CPUs"),
2783 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2784 		    "list of cpus to monitor"),
2785 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2786 		    "child tasks do not inherit counters"),
2787 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2788 		     "number of mmap data pages",
2789 		     perf_evlist__parse_mmap_pages),
2790 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2791 		   "user to profile"),
2792 	OPT_CALLBACK(0, "duration", &trace, "float",
2793 		     "show only events with duration > N.M ms",
2794 		     trace__set_duration),
2795 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2796 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2797 	OPT_BOOLEAN('T', "time", &trace.full_time,
2798 		    "Show full timestamp, not time relative to first start"),
2799 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2800 		    "Show only syscall summary with statistics"),
2801 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2802 		    "Show all syscalls and summary with statistics"),
2803 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2804 		     "Trace pagefaults", parse_pagefaults, "maj"),
2805 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2806 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2807 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2808 		     "record_mode[,record_size]", record_callchain_help,
2809 		     &record_parse_callchain_opt),
2810 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2811 		    "Show the kernel callchains on the syscall exit path"),
2812 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2813 		     "Set the minimum stack depth when parsing the callchain, "
2814 		     "anything below the specified depth will be ignored."),
2815 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2816 		     "Set the maximum stack depth when parsing the callchain, "
2817 		     "anything beyond the specified depth will be ignored. "
2818 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2819 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2820 			"per thread proc mmap processing timeout in ms"),
2821 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2822 		     "ms to wait before starting measurement after program "
2823 		     "start"),
2824 	OPT_END()
2825 	};
	/*
	 * These start true and are flipped below when the corresponding option
	 * still holds its UINT_MAX "unset" sentinel, i.e. the user didn't pass
	 * --mmap-pages / --max-stack on the command line.
	 */
2826 	bool __maybe_unused max_stack_user_set = true;
2827 	bool mmap_pages_user_set = true;
2828 	const char * const trace_subcommands[] = { "record", NULL };
2829 	int err;
2830 	char bf[BUFSIZ];
2831 
	/* If perf itself crashes or hits a FP exception, dump a stack trace. */
2832 	signal(SIGSEGV, sighandler_dump_stack);
2833 	signal(SIGFPE, sighandler_dump_stack);
2834 
2835 	trace.evlist = perf_evlist__new();
2836 	trace.sctbl = syscalltbl__new();
2837 
2838 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2839 		pr_err("Not enough memory to run!\n");
2840 		err = -ENOMEM;
2841 		goto out;
2842 	}
2843 
	/*
	 * Stop at the first non-option so that a workload command line
	 * ("perf trace ls") or the "record" subcommand can follow the options.
	 */
2844 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
2845 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
2846 
2847 	err = bpf__setup_stdout(trace.evlist);
2848 	if (err) {
2849 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
2850 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
2851 		goto out;
2852 	}
2853 
	/*
	 * Default error code: several checks below just 'goto out' (or
	 * 'return -1') without setting err themselves and rely on this.
	 */
2854 	err = -1;
2855 
2856 	if (trace.trace_pgfaults) {
2857 		trace.opts.sample_address = true;
2858 		trace.opts.sample_time = true;
2859 	}
2860 
	/* UINT_MAX sentinel: --mmap-pages was not given by the user. */
2861 	if (trace.opts.mmap_pages == UINT_MAX)
2862 		mmap_pages_user_set = false;
2863 
	/* Same sentinel for --max-stack; pick a default depth when unset. */
2864 	if (trace.max_stack == UINT_MAX) {
2865 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
2866 		max_stack_user_set = false;
2867 	}
2868 
	/*
	 * When DWARF unwinding is available, asking for --min-stack or
	 * --max-stack alone (without an explicit --call-graph) implies
	 * dwarf callchains for syscall tracing.
	 */
2869 #ifdef HAVE_DWARF_UNWIND_SUPPORT
2870 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
2871 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
2872 #endif
2873 
2874 	if (callchain_param.enabled) {
		/*
		 * Callchain samples are large; if the user didn't pick a
		 * buffer size and we're root, grow the mmap buffers.
		 */
2875 		if (!mmap_pages_user_set && geteuid() == 0)
2876 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
2877 
2878 		symbol_conf.use_callchain = true;
2879 	}
2880 
2881 	if (trace.evlist->nr_entries > 0)
2882 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
2883 
	/* "perf trace record ..." delegates to the record machinery. */
2884 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2885 		return trace__record(&trace, argc-1, &argv[1]);
2886 
2887 	/* summary_only implies summary option, but don't overwrite summary if set */
2888 	if (trace.summary_only)
2889 		trace.summary = trace.summary_only;
2890 
	/*
	 * NOTE(review): this path uses 'return -1' while the sibling check
	 * below uses 'goto out' — equivalent only because err was preset
	 * to -1 above; 'goto out' would be more consistent.
	 */
2891 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
2892 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
2893 		pr_err("Please specify something to trace.\n");
2894 		return -1;
2895 	}
2896 
2897 	if (!trace.trace_syscalls && ev_qualifier_str) {
2898 		pr_err("The -e option can't be used with --no-syscalls.\n");
2899 		goto out;
2900 	}
2901 
2902 	if (output_name != NULL) {
2903 		err = trace__open_output(&trace, output_name);
2904 		if (err < 0) {
2905 			perror("failed to create output file");
2906 			goto out;
2907 		}
2908 	}
2909 
	/* Cache the id of the "open" syscall for later lookups. */
2910 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
2911 
	/*
	 * -e qualifier: a list of syscall names (or strace-like group files
	 * under STRACE_GROUPS_DIR); a leading '!' negates the set, i.e.
	 * trace everything except the listed syscalls.
	 */
2912 	if (ev_qualifier_str != NULL) {
2913 		const char *s = ev_qualifier_str;
2914 		struct strlist_config slist_config = {
2915 			.dirname = system_path(STRACE_GROUPS_DIR),
2916 		};
2917 
2918 		trace.not_ev_qualifier = *s == '!';
2919 		if (trace.not_ev_qualifier)
2920 			++s;
2921 		trace.ev_qualifier = strlist__new(s, &slist_config);
2922 		if (trace.ev_qualifier == NULL) {
			/* NOTE(review): message is missing a trailing '\n'. */
2923 			fputs("Not enough memory to parse event qualifier",
2924 			      trace.output);
2925 			err = -ENOMEM;
2926 			goto out_close;
2927 		}
2928 
2929 		err = trace__validate_ev_qualifier(&trace);
2930 		if (err)
2931 			goto out_close;
2932 	}
2933 
2934 	err = target__validate(&trace.opts.target);
2935 	if (err) {
2936 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2937 		fprintf(trace.output, "%s", bf);
2938 		goto out_close;
2939 	}
2940 
2941 	err = target__parse_uid(&trace.opts.target);
2942 	if (err) {
2943 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2944 		fprintf(trace.output, "%s", bf);
2945 		goto out_close;
2946 	}
2947 
	/* No workload and no target (pid/tid/cpu/uid): trace system wide. */
2948 	if (!argc && target__none(&trace.opts.target))
2949 		trace.opts.target.system_wide = true;
2950 
	/* -i: replay a recorded perf.data session; otherwise trace live. */
2951 	if (input_name)
2952 		err = trace__replay(&trace);
2953 	else
2954 		err = trace__run(&trace, argc, argv);
2955 
2956 out_close:
	/* Only close the stream if we opened an output file ourselves above. */
2957 	if (output_name != NULL)
2958 		fclose(trace.output);
2959 out:
2960 	return err;
2961 }
2962
2962