/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace-like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
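
/*
 * For reference, a sketch of what TP_UINT_FIELD(32) above expands to:
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * The memcpy() instead of a plain cast + dereference keeps the read safe
 * when raw_data + offset isn't naturally aligned for the type.
 */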

#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
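
/*
 * Typical usage, as done in trace__sys_enter()/trace__sys_exit() below:
 *
 *	int id     = perf_evsel__sc_tp_uint(evsel, id, sample);
 *	void *args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * The 'name' token selects the struct syscall_tp member that was set up by
 * the perf_evsel__init_sc_tp_{uint,ptr}_field() helpers above.
 */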

size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}
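
/*
 * Worked example: with DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1) below, a
 * raw value of 2 yields idx = 2 - 1 = 1, i.e. "DEL", while an out-of-range
 * value such as 42 falls back to the plain integer format, printing "42".
 */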

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
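
/*
 * e.g. access("/etc/passwd", R_OK|W_OK) gets its 'mode' argument rendered
 * as "RW"; any leftover bits not covered by P_MODE() are appended in hex.
 */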

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
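
/*
 * STRARRAY() is meant to be used as an initializer in the syscall_fmts[]
 * table below, e.g.:
 *
 *	{ .name = "bpf", .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
 *
 * wiring argument 0 up to strarray__bpf_cmd via SCA_STRARRAY.
 */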

#include "trace/beauty/arch_errno_names.c"
#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};

static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ },
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
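
/*
 * syscall_fmt__find() looks this table up with bsearch(), so the entries
 * above must be kept sorted alphabetically by ->name.
 */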

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

/*
 * We need this 'calculated' boolean because in some cases we really
 * don't know what the duration of a syscall is, for instance, when we
 * start a session and some threads are waiting for a syscall to finish,
 * say 'poll', in which case all we can do is to print "( ? )" for the
 * duration and for the start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "     ?   ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}
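
/*
 * e.g. a 1.5ms syscall is rendered as "( 1.500 ms): " (in red, since it is
 * >= 1.0ms), while an entry whose start was never seen prints
 * "(     ?   ): ".
 */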

/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->paths.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}

void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}

static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before the tracing
 * session starts, or a sys_enter lost due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}

static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}

static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first field, '__syscall_nr' or
	 * 'nr', which holds the syscall number and is needless here: it is
	 * called '__syscall_nr' on recent kernels and just 'nr' on older
	 * ones.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}

static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
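
/*
 * The qualifier list accepts globs as well as exact names: e.g. for
 * '-e open*' syscalltbl__strglobmatch_first() records the first match and
 * the loop above keeps calling syscalltbl__strglobmatch_next() to pick up
 * the remaining ones (openat, open_by_handle_at, ...), growing the ids
 * array in chunks of 8 as needed.
 */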

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}
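
/*
 * Callers index the raw argument block directly, e.g. in
 * syscall__scnprintf_args() below:
 *
 *	val = syscall_arg__val(&arg, arg.idx);
 */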

static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in their 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in a strarray
			 * for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);

static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{
	if (id < 0) {
		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if (id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}

static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}

static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, evsel->attr.sample_max_stack))
		return -1;

	return 0;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}

static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);
	const char *arch_name = perf_env__arch(env);

	return arch_syscalls__strerrno(arch_name, err);
}

static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1784 
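/*
 * probe:vfs_getname handler: stashes the pathname being resolved so that it
 * can be associated with the fd returned at syscall exit and, when an
 * open/openat entry is pending, splices it into the already formatted entry
 * string at the offset recorded in filename.entry_str_pos.
 */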
1785 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1786 			      union perf_event *event __maybe_unused,
1787 			      struct perf_sample *sample)
1788 {
1789 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1790 	struct thread_trace *ttrace;
1791 	size_t filename_len, entry_str_len, to_move;
1792 	ssize_t remaining_space;
1793 	char *pos;
1794 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1795 
1796 	if (!thread)
1797 		goto out;
1798 
1799 	ttrace = thread__priv(thread);
1800 	if (!ttrace)
1801 		goto out_put;
1802 
1803 	filename_len = strlen(filename);
1804 	if (filename_len == 0)
1805 		goto out_put;
1806 
1807 	if (ttrace->filename.namelen < filename_len) {
1808 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1809 
1810 		if (f == NULL)
1811 			goto out_put;
1812 
1813 		ttrace->filename.namelen = filename_len;
1814 		ttrace->filename.name = f;
1815 	}
1816 
1817 	strcpy(ttrace->filename.name, filename);
1818 	ttrace->filename.pending_open = true;
1819 
1820 	if (!ttrace->filename.ptr)
1821 		goto out_put;
1822 
1823 	entry_str_len = strlen(ttrace->entry_str);
1824 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1825 	if (remaining_space <= 0)
1826 		goto out_put;
1827 
1828 	if (filename_len > (size_t)remaining_space) {
1829 		filename += filename_len - remaining_space;
1830 		filename_len = remaining_space;
1831 	}
1832 
1833 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1834 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1835 	memmove(pos + filename_len, pos, to_move);
1836 	memcpy(pos, filename, filename_len);
1837 
1838 	ttrace->filename.ptr = 0;
1839 	ttrace->filename.entry_str_pos = 0;
1840 out_put:
1841 	thread__put(thread);
1842 out:
1843 	return 0;
1844 }
1845 
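/*
 * sched:sched_stat_runtime handler, used with --sched: accumulates the
 * reported runtime per thread and globally, for the msec column in the
 * thread summary.
 */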
1846 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1847 				     union perf_event *event __maybe_unused,
1848 				     struct perf_sample *sample)
1849 {
1850 	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1851 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1852 	struct thread *thread = machine__findnew_thread(trace->host,
1853 							sample->pid,
1854 							sample->tid);
1855 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1856 
1857 	if (ttrace == NULL)
1858 		goto out_dump;
1859 
1860 	ttrace->runtime_ms += runtime_ms;
1861 	trace->runtime_ms += runtime_ms;
1862 out_put:
1863 	thread__put(thread);
1864 	return 0;
1865 
1866 out_dump:
1867 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1868 	       evsel->name,
1869 	       perf_evsel__strval(evsel, sample, "comm"),
1870 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1871 	       runtime,
1872 	       perf_evsel__intval(evsel, sample, "vruntime"));
1873 	goto out_put;
1874 }
1875 
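/*
 * binary__fprintf() callback: dumps the BPF output payload a byte at a
 * time, as-is when printable and as '.' otherwise, ignoring all the
 * line/address decoration ops.
 */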
1876 static int bpf_output__printer(enum binary_printer_ops op,
1877 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1878 {
1879 	unsigned char ch = (unsigned char)val;
1880 
1881 	switch (op) {
1882 	case BINARY_PRINT_CHAR_DATA:
1883 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1884 	case BINARY_PRINT_DATA_BEGIN:
1885 	case BINARY_PRINT_LINE_BEGIN:
1886 	case BINARY_PRINT_ADDR:
1887 	case BINARY_PRINT_NUM_DATA:
1888 	case BINARY_PRINT_NUM_PAD:
1889 	case BINARY_PRINT_SEP:
1890 	case BINARY_PRINT_CHAR_PAD:
1891 	case BINARY_PRINT_LINE_END:
1892 	case BINARY_PRINT_DATA_END:
1893 	default:
1894 		break;
1895 	}
1896 
1897 	return 0;
1898 }
1899 
1900 static void bpf_output__fprintf(struct trace *trace,
1901 				struct perf_sample *sample)
1902 {
1903 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1904 			bpf_output__printer, NULL, trace->output);
1905 }
1906 
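/*
 * Handler for the non-syscall events added via --event: prints a
 * timestamped line carrying either the raw BPF output buffer or the
 * formatted tracepoint fields.
 */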
1907 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1908 				union perf_event *event __maybe_unused,
1909 				struct perf_sample *sample)
1910 {
1911 	int callchain_ret = 0;
1912 
1913 	if (sample->callchain) {
1914 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1915 		if (callchain_ret == 0) {
1916 			if (callchain_cursor.nr < trace->min_stack)
1917 				goto out;
1918 			callchain_ret = 1;
1919 		}
1920 	}
1921 
1922 	trace__printf_interrupted_entry(trace, sample);
1923 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1924 
1925 	if (trace->trace_syscalls)
1926 		fprintf(trace->output, "(         ): ");
1927 
1928 	fprintf(trace->output, "%s:", evsel->name);
1929 
1930 	if (perf_evsel__is_bpf_output(evsel)) {
1931 		bpf_output__fprintf(trace, sample);
1932 	} else if (evsel->tp_format) {
1933 		event_format__fprintf(evsel->tp_format, sample->cpu,
1934 				      sample->raw_data, sample->raw_size,
1935 				      trace->output);
1936 	}
1937 
1938 	fprintf(trace->output, ")\n");
1939 
1940 	if (callchain_ret > 0)
1941 		trace__fprintf_callchain(trace, sample);
1942 	else if (callchain_ret < 0)
1943 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1944 out:
1945 	return 0;
1946 }
1947 
1948 static void print_location(FILE *f, struct perf_sample *sample,
1949 			   struct addr_location *al,
1950 			   bool print_dso, bool print_sym)
1951 {
1953 	if ((verbose > 0 || print_dso) && al->map)
1954 		fprintf(f, "%s@", al->map->dso->long_name);
1955 
1956 	if ((verbose > 0 || print_sym) && al->sym)
1957 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1958 			al->addr - al->sym->start);
1959 	else if (al->map)
1960 		fprintf(f, "0x%" PRIx64, al->addr);
1961 	else
1962 		fprintf(f, "0x%" PRIx64, sample->addr);
1963 }
1964 
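/*
 * Page fault handler: counts major/minor faults per thread and, unless
 * --summary-only was asked for, prints the faulting code location and the
 * address being accessed, tagged with the map type and level.
 */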
1965 static int trace__pgfault(struct trace *trace,
1966 			  struct perf_evsel *evsel,
1967 			  union perf_event *event __maybe_unused,
1968 			  struct perf_sample *sample)
1969 {
1970 	struct thread *thread;
1971 	struct addr_location al;
1972 	char map_type = 'd';
1973 	struct thread_trace *ttrace;
1974 	int err = -1;
1975 	int callchain_ret = 0;
1976 
1977 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978 
1979 	if (sample->callchain) {
1980 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1981 		if (callchain_ret == 0) {
1982 			if (callchain_cursor.nr < trace->min_stack)
1983 				goto out_put;
1984 			callchain_ret = 1;
1985 		}
1986 	}
1987 
1988 	ttrace = thread__trace(thread, trace->output);
1989 	if (ttrace == NULL)
1990 		goto out_put;
1991 
1992 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1993 		ttrace->pfmaj++;
1994 	else
1995 		ttrace->pfmin++;
1996 
1997 	if (trace->summary_only)
1998 		goto out;
1999 
2000 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2001 				   sample->ip, &al);
2002 
2003 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2004 
2005 	fprintf(trace->output, "%sfault [",
2006 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2007 		"maj" : "min");
2008 
2009 	print_location(trace->output, sample, &al, false, true);
2010 
2011 	fprintf(trace->output, "] => ");
2012 
2013 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2014 				   sample->addr, &al);
2015 
2016 	if (!al.map) {
2017 		thread__find_addr_location(thread, sample->cpumode,
2018 					   MAP__FUNCTION, sample->addr, &al);
2019 
2020 		if (al.map)
2021 			map_type = 'x';
2022 		else
2023 			map_type = '?';
2024 	}
2025 
2026 	print_location(trace->output, sample, &al, true, false);
2027 
2028 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2029 
2030 	if (callchain_ret > 0)
2031 		trace__fprintf_callchain(trace, sample);
2032 	else if (callchain_ret < 0)
2033 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2034 out:
2035 	err = 0;
2036 out_put:
2037 	thread__put(thread);
2038 	return err;
2039 }
2040 
2041 static void trace__set_base_time(struct trace *trace,
2042 				 struct perf_evsel *evsel,
2043 				 struct perf_sample *sample)
2044 {
2045 	/*
2046 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2047 	 * and don't use sample->time unconditionally, we may end up having
2048 	 * some other event in the future without PERF_SAMPLE_TIME for good
2049 	 * reason, i.e. we may not be interested in its timestamps, just in
2050 	 * it taking place, picking some piece of information when it
2051 	 * appears in our event stream (vfs_getname comes to mind).
2052 	 */
2053 	if (trace->base_time == 0 && !trace->full_time &&
2054 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2055 		trace->base_time = sample->time;
2056 }
2057 
2058 static int trace__process_sample(struct perf_tool *tool,
2059 				 union perf_event *event,
2060 				 struct perf_sample *sample,
2061 				 struct perf_evsel *evsel,
2062 				 struct machine *machine __maybe_unused)
2063 {
2064 	struct trace *trace = container_of(tool, struct trace, tool);
2065 	struct thread *thread;
2066 	int err = 0;
2067 
2068 	tracepoint_handler handler = evsel->handler;
2069 
2070 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2071 	if (thread && thread__is_filtered(thread))
2072 		goto out;
2073 
2074 	trace__set_base_time(trace, evsel, sample);
2075 
2076 	if (handler) {
2077 		++trace->nr_events;
2078 		handler(trace, evsel, event, sample);
2079 	}
2080 out:
2081 	thread__put(thread);
2082 	return err;
2083 }
2084 
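/*
 * 'perf trace record' gets rewritten into a 'perf record' command line,
 * roughly (illustrative):
 *
 *   perf trace record ls
 *
 * becomes:
 *
 *   perf record -R -m 1024 -c 1 -e raw_syscalls:sys_enter,raw_syscalls:sys_exit ls
 */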
2085 static int trace__record(struct trace *trace, int argc, const char **argv)
2086 {
2087 	unsigned int rec_argc, i, j;
2088 	const char **rec_argv;
2089 	const char * const record_args[] = {
2090 		"record",
2091 		"-R",
2092 		"-m", "1024",
2093 		"-c", "1",
2094 	};
2095 
2096 	const char * const sc_args[] = { "-e", };
2097 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2098 	const char * const majpf_args[] = { "-e", "major-faults" };
2099 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2100 	const char * const minpf_args[] = { "-e", "minor-faults" };
2101 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2102 
2103 	/* +1 is for the event string below */
2104 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2105 		majpf_args_nr + minpf_args_nr + argc;
2106 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2107 
2108 	if (rec_argv == NULL)
2109 		return -ENOMEM;
2110 
2111 	j = 0;
2112 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2113 		rec_argv[j++] = record_args[i];
2114 
2115 	if (trace->trace_syscalls) {
2116 		for (i = 0; i < sc_args_nr; i++)
2117 			rec_argv[j++] = sc_args[i];
2118 
2119 		/* event string may be different for older kernels - e.g., RHEL6 */
2120 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2121 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2122 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2123 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2124 		else {
2125 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2126 			free(rec_argv);
2127 			return -1;
2128 		}
2129 	}
2130 
2131 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2132 		for (i = 0; i < majpf_args_nr; i++)
2133 			rec_argv[j++] = majpf_args[i];
2134 
2135 	if (trace->trace_pgfaults & TRACE_PFMIN)
2136 		for (i = 0; i < minpf_args_nr; i++)
2137 			rec_argv[j++] = minpf_args[i];
2138 
2139 	for (i = 0; i < (unsigned int)argc; i++)
2140 		rec_argv[j++] = argv[i];
2141 
2142 	return cmd_record(j, rec_argv);
2143 }
2144 
2145 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2146 
2147 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2148 {
2149 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2150 
2151 	if (IS_ERR(evsel))
2152 		return false;
2153 
2154 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2155 		perf_evsel__delete(evsel);
2156 		return false;
2157 	}
2158 
2159 	evsel->handler = trace__vfs_getname;
2160 	perf_evlist__add(evlist, evsel);
2161 	return true;
2162 }
2163 
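/*
 * Creates a software page fault event (major or minor, per config) with
 * sample_period = 1, so that every single fault generates a sample.
 */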
2164 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2165 {
2166 	struct perf_evsel *evsel;
2167 	struct perf_event_attr attr = {
2168 		.type = PERF_TYPE_SOFTWARE,
2169 		.mmap_data = 1,
2170 	};
2171 
2172 	attr.config = config;
2173 	attr.sample_period = 1;
2174 
2175 	event_attr_init(&attr);
2176 
2177 	evsel = perf_evsel__new(&attr);
2178 	if (evsel)
2179 		evsel->handler = trace__pgfault;
2180 
2181 	return evsel;
2182 }
2183 
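/*
 * Dispatches one mmap'ed event: non-sample records go to the generic
 * side-band machinery, samples are routed to the handler hanging off the
 * evsel their id maps to.
 */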
2184 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2185 {
2186 	const u32 type = event->header.type;
2187 	struct perf_evsel *evsel;
2188 
2189 	if (type != PERF_RECORD_SAMPLE) {
2190 		trace__process_event(trace, trace->host, event, sample);
2191 		return;
2192 	}
2193 
2194 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2195 	if (evsel == NULL) {
2196 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2197 		return;
2198 	}
2199 
2200 	trace__set_base_time(trace, evsel, sample);
2201 
2202 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2203 	    sample->raw_data == NULL) {
2204 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2205 		       perf_evsel__name(evsel), sample->tid,
2206 		       sample->cpu, sample->raw_size);
2207 	} else {
2208 		tracepoint_handler handler = evsel->handler;
2209 		handler(trace, evsel, event, sample);
2210 	}
2211 }
2212 
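/*
 * Sets up the raw_syscalls:sys_enter/sys_exit tracepoint pair, resolving
 * the 'args' and 'ret' payload fields and configuring callchains, which by
 * default exclude kernel frames on the exit path (--kernel-syscall-graph
 * overrides that).
 */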
2213 static int trace__add_syscall_newtp(struct trace *trace)
2214 {
2215 	int ret = -1;
2216 	struct perf_evlist *evlist = trace->evlist;
2217 	struct perf_evsel *sys_enter, *sys_exit;
2218 
2219 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2220 	if (sys_enter == NULL)
2221 		goto out;
2222 
2223 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2224 		goto out_delete_sys_enter;
2225 
2226 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2227 	if (sys_exit == NULL)
2228 		goto out_delete_sys_enter;
2229 
2230 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2231 		goto out_delete_sys_exit;
2232 
2233 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2234 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2235 
2236 	perf_evlist__add(evlist, sys_enter);
2237 	perf_evlist__add(evlist, sys_exit);
2238 
2239 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2240 		/*
2241 		 * We're interested only in the user space callchain
2242 		 * leading to the syscall, allow overriding that for
2243 		 * debugging reasons using --kernel-syscall-graph
2244 		 */
2245 		sys_exit->attr.exclude_callchain_kernel = 1;
2246 	}
2247 
2248 	trace->syscalls.events.sys_enter = sys_enter;
2249 	trace->syscalls.events.sys_exit  = sys_exit;
2250 
2251 	ret = 0;
2252 out:
2253 	return ret;
2254 
2255 out_delete_sys_exit:
2256 	perf_evsel__delete_priv(sys_exit);
2257 out_delete_sys_enter:
2258 	perf_evsel__delete_priv(sys_enter);
2259 	goto out;
2260 }
2261 
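/*
 * Turns the -e syscall qualifier into a tracepoint filter on the syscall
 * id, appended to both sys_enter and sys_exit, e.g. (illustrative, ids are
 * per-arch):
 *
 *   -e open,close  ->  "id == 2 || id == 3"
 */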
2262 static int trace__set_ev_qualifier_filter(struct trace *trace)
2263 {
2264 	int err = -1;
2265 	struct perf_evsel *sys_exit;
2266 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2267 						trace->ev_qualifier_ids.nr,
2268 						trace->ev_qualifier_ids.entries);
2269 
2270 	if (filter == NULL)
2271 		goto out_enomem;
2272 
2273 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2274 					  filter)) {
2275 		sys_exit = trace->syscalls.events.sys_exit;
2276 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2277 	}
2278 
2279 	free(filter);
2280 out:
2281 	return err;
2282 out_enomem:
2283 	errno = ENOMEM;
2284 	goto out;
2285 }
2286 
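/*
 * Filters out events for some pids to avoid a feedback loop when tracing
 * system wide: perf trace itself and, walking up the parents, the sshd
 * serving the session, as writing the trace output would otherwise
 * generate yet more events to trace.
 */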
2287 static int trace__set_filter_loop_pids(struct trace *trace)
2288 {
2289 	unsigned int nr = 1;
2290 	pid_t pids[32] = {
2291 		getpid(),
2292 	};
2293 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2294 
2295 	while (thread && nr < ARRAY_SIZE(pids)) {
2296 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2297 
2298 		if (parent == NULL)
2299 			break;
2300 
2301 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2302 			pids[nr++] = parent->tid;
2303 			break;
2304 		}
2305 		thread = parent;
2306 	}
2307 
2308 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2309 }
2310 
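/*
 * Live mode: sets up the requested events, creates the thread/cpu maps,
 * optionally forks the workload, then opens, mmaps and enables the events
 * and consumes the ring buffers until interrupted or, when there is a
 * workload, until it exits.
 */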
2311 static int trace__run(struct trace *trace, int argc, const char **argv)
2312 {
2313 	struct perf_evlist *evlist = trace->evlist;
2314 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2315 	int err = -1, i;
2316 	unsigned long before;
2317 	const bool forks = argc > 0;
2318 	bool draining = false;
2319 
2320 	trace->live = true;
2321 
2322 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2323 		goto out_error_raw_syscalls;
2324 
2325 	if (trace->trace_syscalls)
2326 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2327 
2328 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2329 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2330 		if (pgfault_maj == NULL)
2331 			goto out_error_mem;
2332 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2333 		perf_evlist__add(evlist, pgfault_maj);
2334 	}
2335 
2336 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2337 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2338 		if (pgfault_min == NULL)
2339 			goto out_error_mem;
2340 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2341 		perf_evlist__add(evlist, pgfault_min);
2342 	}
2343 
2344 	if (trace->sched &&
2345 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2346 				   trace__sched_stat_runtime))
2347 		goto out_error_sched_stat_runtime;
2348 
2349 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2350 	if (err < 0) {
2351 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2352 		goto out_delete_evlist;
2353 	}
2354 
2355 	err = trace__symbols_init(trace, evlist);
2356 	if (err < 0) {
2357 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2358 		goto out_delete_evlist;
2359 	}
2360 
2361 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2362 
2363 	signal(SIGCHLD, sig_handler);
2364 	signal(SIGINT, sig_handler);
2365 
2366 	if (forks) {
2367 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2368 						    argv, false, NULL);
2369 		if (err < 0) {
2370 			fprintf(trace->output, "Couldn't run the workload!\n");
2371 			goto out_delete_evlist;
2372 		}
2373 	}
2374 
2375 	err = perf_evlist__open(evlist);
2376 	if (err < 0)
2377 		goto out_error_open;
2378 
2379 	err = bpf__apply_obj_config();
2380 	if (err) {
2381 		char errbuf[BUFSIZ];
2382 
2383 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2384 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2385 			 errbuf);
2386 		goto out_error_open;
2387 	}
2388 
2389 	/*
2390 	 * Better not use !target__has_task() here because we need to cover the
2391 	 * case where no threads were specified in the command line, but a
2392 	 * workload was, and in that case we will fill in the thread_map when
2393 	 * we fork the workload in perf_evlist__prepare_workload.
2394 	 */
2395 	if (trace->filter_pids.nr > 0)
2396 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2397 	else if (thread_map__pid(evlist->threads, 0) == -1)
2398 		err = trace__set_filter_loop_pids(trace);
2399 
2400 	if (err < 0)
2401 		goto out_error_mem;
2402 
2403 	if (trace->ev_qualifier_ids.nr > 0) {
2404 		err = trace__set_ev_qualifier_filter(trace);
2405 		if (err < 0)
2406 			goto out_errno;
2407 
2408 		pr_debug("event qualifier tracepoint filter: %s\n",
2409 			 trace->syscalls.events.sys_exit->filter);
2410 	}
2411 
2412 	err = perf_evlist__apply_filters(evlist, &evsel);
2413 	if (err < 0)
2414 		goto out_error_apply_filters;
2415 
2416 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2417 	if (err < 0)
2418 		goto out_error_mmap;
2419 
2420 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2421 		perf_evlist__enable(evlist);
2422 
2423 	if (forks)
2424 		perf_evlist__start_workload(evlist);
2425 
2426 	if (trace->opts.initial_delay) {
2427 		usleep(trace->opts.initial_delay * 1000);
2428 		perf_evlist__enable(evlist);
2429 	}
2430 
2431 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2432 				  evlist->threads->nr > 1 ||
2433 				  perf_evlist__first(evlist)->attr.inherit;
2434 
2435 	/*
2436 	 * Now that we already used evsel->attr to ask the kernel to setup the
2437 	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
2438 	 * trace__resolve_callchain(), allowing per-event max-stack settings
2439 	 * to override an explicitly set --max-stack global setting.
2440 	 */
2441 	evlist__for_each_entry(evlist, evsel) {
2442 		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2443 		    evsel->attr.sample_max_stack == 0)
2444 			evsel->attr.sample_max_stack = trace->max_stack;
2445 	}
2446 again:
2447 	before = trace->nr_events;
2448 
2449 	for (i = 0; i < evlist->nr_mmaps; i++) {
2450 		union perf_event *event;
2451 
2452 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2453 			struct perf_sample sample;
2454 
2455 			++trace->nr_events;
2456 
2457 			err = perf_evlist__parse_sample(evlist, event, &sample);
2458 			if (err) {
2459 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2460 				goto next_event;
2461 			}
2462 
2463 			trace__handle_event(trace, event, &sample);
2464 next_event:
2465 			perf_evlist__mmap_consume(evlist, i);
2466 
2467 			if (interrupted)
2468 				goto out_disable;
2469 
2470 			if (done && !draining) {
2471 				perf_evlist__disable(evlist);
2472 				draining = true;
2473 			}
2474 		}
2475 	}
2476 
2477 	if (trace->nr_events == before) {
2478 		int timeout = done ? 100 : -1;
2479 
2480 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2481 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2482 				draining = true;
2483 
2484 			goto again;
2485 		}
2486 	} else {
2487 		goto again;
2488 	}
2489 
2490 out_disable:
2491 	thread__zput(trace->current);
2492 
2493 	perf_evlist__disable(evlist);
2494 
2495 	if (!err) {
2496 		if (trace->summary)
2497 			trace__fprintf_thread_summary(trace, trace->output);
2498 
2499 		if (trace->show_tool_stats) {
2500 			fprintf(trace->output, "Stats:\n "
2501 					       " vfs_getname : %" PRIu64 "\n"
2502 					       " proc_getname: %" PRIu64 "\n",
2503 				trace->stats.vfs_getname,
2504 				trace->stats.proc_getname);
2505 		}
2506 	}
2507 
2508 out_delete_evlist:
2509 	trace__symbols__exit(trace);
2510 
2511 	perf_evlist__delete(evlist);
2512 	trace->evlist = NULL;
2513 	trace->live = false;
2514 	return err;
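	/*
	 * Out of line error handling: the block below exists only to scope
	 * errbuf for the error labels inside it, each formatting a message
	 * before jumping back to out_delete_evlist above.
	 */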
2515 {
2516 	char errbuf[BUFSIZ];
2517 
2518 out_error_sched_stat_runtime:
2519 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2520 	goto out_error;
2521 
2522 out_error_raw_syscalls:
2523 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2524 	goto out_error;
2525 
2526 out_error_mmap:
2527 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2528 	goto out_error;
2529 
2530 out_error_open:
2531 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2532 
2533 out_error:
2534 	fprintf(trace->output, "%s\n", errbuf);
2535 	goto out_delete_evlist;
2536 
2537 out_error_apply_filters:
2538 	fprintf(trace->output,
2539 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2540 		evsel->filter, perf_evsel__name(evsel), errno,
2541 		str_error_r(errno, errbuf, sizeof(errbuf)));
2542 	goto out_delete_evlist;
2543 }
2544 out_error_mem:
2545 	fprintf(trace->output, "Not enough memory to run!\n");
2546 	goto out_delete_evlist;
2547 
2548 out_errno:
2549 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2550 	goto out_delete_evlist;
2551 }
2552 
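/*
 * Replay mode ('perf trace -i perf.data'): processes a file recorded with
 * 'perf trace record', wiring up the same tracepoint handlers used in live
 * mode.
 */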
2553 static int trace__replay(struct trace *trace)
2554 {
2555 	const struct perf_evsel_str_handler handlers[] = {
2556 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2557 	};
2558 	struct perf_data data = {
2559 		.file      = {
2560 			.path = input_name,
2561 		},
2562 		.mode      = PERF_DATA_MODE_READ,
2563 		.force     = trace->force,
2564 	};
2565 	struct perf_session *session;
2566 	struct perf_evsel *evsel;
2567 	int err = -1;
2568 
2569 	trace->tool.sample	  = trace__process_sample;
2570 	trace->tool.mmap	  = perf_event__process_mmap;
2571 	trace->tool.mmap2	  = perf_event__process_mmap2;
2572 	trace->tool.comm	  = perf_event__process_comm;
2573 	trace->tool.exit	  = perf_event__process_exit;
2574 	trace->tool.fork	  = perf_event__process_fork;
2575 	trace->tool.attr	  = perf_event__process_attr;
2576 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2577 	trace->tool.build_id	  = perf_event__process_build_id;
2578 	trace->tool.namespaces	  = perf_event__process_namespaces;
2579 
2580 	trace->tool.ordered_events = true;
2581 	trace->tool.ordering_requires_timestamps = true;
2582 
2583 	/* add tid to output */
2584 	trace->multiple_threads = true;
2585 
2586 	session = perf_session__new(&data, false, &trace->tool);
2587 	if (session == NULL)
2588 		return -1;
2589 
2590 	if (trace->opts.target.pid)
2591 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2592 
2593 	if (trace->opts.target.tid)
2594 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2595 
2596 	if (symbol__init(&session->header.env) < 0)
2597 		goto out;
2598 
2599 	trace->host = &session->machines.host;
2600 
2601 	err = perf_session__set_tracepoints_handlers(session, handlers);
2602 	if (err)
2603 		goto out;
2604 
2605 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2606 						     "raw_syscalls:sys_enter");
2607 	/* older kernels have syscalls tp versus raw_syscalls */
2608 	if (evsel == NULL)
2609 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2610 							     "syscalls:sys_enter");
2611 
2612 	if (evsel &&
2613 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2614 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2615 		pr_err("Error initializing raw_syscalls:sys_enter event\n");
2616 		goto out;
2617 	}
2618 
2619 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2620 						     "raw_syscalls:sys_exit");
2621 	if (evsel == NULL)
2622 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2623 							     "syscalls:sys_exit");
2624 	if (evsel &&
2625 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2626 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2627 		pr_err("Error initializing raw_syscalls:sys_exit event\n");
2628 		goto out;
2629 	}
2630 
2631 	evlist__for_each_entry(session->evlist, evsel) {
2632 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2633 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2634 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2635 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2636 			evsel->handler = trace__pgfault;
2637 	}
2638 
2639 	setup_pager();
2640 
2641 	err = perf_session__process_events(session);
2642 	if (err)
2643 		pr_err("Failed to process events, error %d\n", err);
2645 	else if (trace->summary)
2646 		trace__fprintf_thread_summary(trace, trace->output);
2647 
2648 out:
2649 	perf_session__delete(session);
2650 
2651 	return err;
2652 }
2653 
2654 static size_t trace__fprintf_threads_header(FILE *fp)
2655 {
2656 	size_t printed;
2657 
2658 	printed  = fprintf(fp, "\n Summary of events:\n\n");
2659 
2660 	return printed;
2661 }
2662 
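/*
 * Re-sorts the intlist of per-syscall-id stats into a rb_tree ordered by
 * total milliseconds spent, for the per-thread summary table below.
 */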
2663 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2664 	struct stats 	*stats;
2665 	double		msecs;
2666 	int		syscall;
2667 )
2668 {
2669 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2670 	struct stats *stats = source->priv;
2671 
2672 	entry->syscall = source->i;
2673 	entry->stats   = stats;
2674 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2675 }
2676 
2677 static size_t thread__dump_stats(struct thread_trace *ttrace,
2678 				 struct trace *trace, FILE *fp)
2679 {
2680 	size_t printed = 0;
2681 	struct syscall *sc;
2682 	struct rb_node *nd;
2683 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2684 
2685 	if (syscall_stats == NULL)
2686 		return 0;
2687 
2688 	printed += fprintf(fp, "\n");
2689 
2690 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2691 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2692 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2693 
2694 	resort_rb__for_each_entry(nd, syscall_stats) {
2695 		struct stats *stats = syscall_stats_entry->stats;
2696 		if (stats) {
2697 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2698 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2699 			double avg = avg_stats(stats);
2700 			double pct;
2701 			u64 n = (u64) stats->n;
2702 
2703 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2704 			avg /= NSEC_PER_MSEC;
2705 
2706 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2707 			printed += fprintf(fp, "   %-15s", sc->name);
2708 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2709 					   n, syscall_stats_entry->msecs, min, avg);
2710 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2711 		}
2712 	}
2713 
2714 	resort_rb__delete(syscall_stats);
2715 	printed += fprintf(fp, "\n\n");
2716 
2717 	return printed;
2718 }
2719 
2720 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2721 {
2722 	size_t printed = 0;
2723 	struct thread_trace *ttrace = thread__priv(thread);
2724 	double ratio;
2725 
2726 	if (ttrace == NULL)
2727 		return 0;
2728 
2729 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2730 
2731 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2732 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2733 	printed += fprintf(fp, "%.1f%%", ratio);
2734 	if (ttrace->pfmaj)
2735 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2736 	if (ttrace->pfmin)
2737 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2738 	if (trace->sched)
2739 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2740 	else if (fputc('\n', fp) != EOF)
2741 		++printed;
2742 
2743 	printed += thread__dump_stats(ttrace, trace, fp);
2744 
2745 	return printed;
2746 }
2747 
2748 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2749 {
2750 	return ttrace ? ttrace->nr_events : 0;
2751 }
2752 
2753 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2754 	struct thread *thread;
2755 )
2756 {
2757 	entry->thread = rb_entry(nd, struct thread, rb_node);
2758 }
2759 
2760 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2761 {
2762 	size_t printed = trace__fprintf_threads_header(fp);
2763 	struct rb_node *nd;
2764 	int i;
2765 
2766 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2767 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2768 
2769 		if (threads == NULL) {
2770 			fprintf(fp, "Error sorting output by nr_events!\n");
2771 			return 0;
2772 		}
2773 
2774 		resort_rb__for_each_entry(nd, threads)
2775 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2776 
2777 		resort_rb__delete(threads);
2778 	}
2779 	return printed;
2780 }
2781 
2782 static int trace__set_duration(const struct option *opt, const char *str,
2783 			       int unset __maybe_unused)
2784 {
2785 	struct trace *trace = opt->value;
2786 
2787 	trace->duration_filter = atof(str);
2788 	return 0;
2789 }
2790 
2791 static int trace__set_filter_pids(const struct option *opt, const char *str,
2792 				  int unset __maybe_unused)
2793 {
2794 	int ret = -1;
2795 	size_t i;
2796 	struct trace *trace = opt->value;
2797 	/*
2798 	 * FIXME: introduce an intarray class, plain parse csv and create a
2799 	 * { int nr, int entries[] } struct...
2800 	 */
2801 	struct intlist *list = intlist__new(str);
2802 
2803 	if (list == NULL)
2804 		return -1;
2805 
2806 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2807 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2808 
2809 	if (trace->filter_pids.entries == NULL)
2810 		goto out;
2811 
2812 	trace->filter_pids.entries[0] = getpid();
2813 
2814 	for (i = 1; i < trace->filter_pids.nr; ++i)
2815 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2816 
2817 	intlist__delete(list);
2818 	ret = 0;
2819 out:
2820 	return ret;
2821 }
2822 
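/*
 * Opens the -o output file, first rotating a previous non-empty file of
 * the same name to <name>.old.
 */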
2823 static int trace__open_output(struct trace *trace, const char *filename)
2824 {
2825 	struct stat st;
2826 
2827 	if (!stat(filename, &st) && st.st_size) {
2828 		char oldname[PATH_MAX];
2829 
2830 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2831 		unlink(oldname);
2832 		rename(filename, oldname);
2833 	}
2834 
2835 	trace->output = fopen(filename, "w");
2836 
2837 	return trace->output == NULL ? -errno : 0;
2838 }
2839 
2840 static int parse_pagefaults(const struct option *opt, const char *str,
2841 			    int unset __maybe_unused)
2842 {
2843 	int *trace_pgfaults = opt->value;
2844 
2845 	if (strcmp(str, "all") == 0)
2846 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2847 	else if (strcmp(str, "maj") == 0)
2848 		*trace_pgfaults |= TRACE_PFMAJ;
2849 	else if (strcmp(str, "min") == 0)
2850 		*trace_pgfaults |= TRACE_PFMIN;
2851 	else
2852 		return -1;
2853 
2854 	return 0;
2855 }
2856 
2857 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2858 {
2859 	struct perf_evsel *evsel;
2860 
2861 	evlist__for_each_entry(evlist, evsel)
2862 		evsel->handler = handler;
2863 }
2864 
2865 /*
2866  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2867  * (raw_syscalls:{sys_{enter,exit}}) + events (tracepoints, HW, SW, etc.)) to
2868  * use existing facilities unchanged (trace->ev_qualifier + parse_options()).
2869  *
2870  * It'd be better to introduce a parse_options() variant that would return a
2871  * list with the terms it didn't match to an event...
2872  */
2873 static int trace__parse_events_option(const struct option *opt, const char *str,
2874 				      int unset __maybe_unused)
2875 {
2876 	struct trace *trace = (struct trace *)opt->value;
2877 	const char *s = str;
2878 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2879 	int len = strlen(str) + 1, err = -1, list, idx;
2880 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2881 	char group_name[PATH_MAX];
2882 
2883 	if (strace_groups_dir == NULL)
2884 		return -1;
2885 
2886 	if (*s == '!') {
2887 		++s;
2888 		trace->not_ev_qualifier = true;
2889 	}
2890 
2891 	while (1) {
2892 		if ((sep = strchr(s, ',')) != NULL)
2893 			*sep = '\0';
2894 
2895 		list = 0;
2896 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2897 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2898 			list = 1;
2899 		} else {
2900 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2901 			if (access(group_name, R_OK) == 0)
2902 				list = 1;
2903 		}
2904 
2905 		if (lists[list]) {
2906 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2907 		} else {
2908 			lists[list] = malloc(len);
2909 			if (lists[list] == NULL)
2910 				goto out;
2911 			strcpy(lists[list], s);
2912 		}
2913 
2914 		if (!sep)
2915 			break;
2916 
2917 		*sep = ',';
2918 		s = sep + 1;
2919 	}
2920 
2921 	if (lists[1] != NULL) {
2922 		struct strlist_config slist_config = {
2923 			.dirname = strace_groups_dir,
2924 		};
2925 
2926 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2927 		if (trace->ev_qualifier == NULL) {
2928 			fputs("Not enough memory to parse event qualifier\n", trace->output);
2929 			goto out;
2930 		}
2931 
2932 		if (trace__validate_ev_qualifier(trace))
2933 			goto out;
2934 	}
2935 
2936 	err = 0;
2937 
2938 	if (lists[0]) {
2939 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2940 					       "event selector. use 'perf list' to list available events",
2941 					       parse_events_option);
2942 		err = parse_events_option(&o, lists[0], 0);
2943 	}
2944 out:
2945 	if (sep)
2946 		*sep = ',';
2947 
2948 	return err;
2949 }
2950 
2951 int cmd_trace(int argc, const char **argv)
2952 {
2953 	const char *trace_usage[] = {
2954 		"perf trace [<options>] [<command>]",
2955 		"perf trace [<options>] -- <command> [<options>]",
2956 		"perf trace record [<options>] [<command>]",
2957 		"perf trace record [<options>] -- <command> [<options>]",
2958 		NULL
2959 	};
2960 	struct trace trace = {
2961 		.syscalls = {
2962 			.max = -1,
2963 		},
2964 		.opts = {
2965 			.target = {
2966 				.uid	   = UINT_MAX,
2967 				.uses_mmap = true,
2968 			},
2969 			.user_freq     = UINT_MAX,
2970 			.user_interval = ULLONG_MAX,
2971 			.no_buffering  = true,
2972 			.mmap_pages    = UINT_MAX,
2973 			.proc_map_timeout  = 500,
2974 		},
2975 		.output = stderr,
2976 		.show_comm = true,
2977 		.trace_syscalls = true,
2978 		.kernel_syscallchains = false,
2979 		.max_stack = UINT_MAX,
2980 	};
2981 	const char *output_name = NULL;
2982 	const struct option trace_options[] = {
2983 	OPT_CALLBACK('e', "event", &trace, "event",
2984 		     "event/syscall selector. use 'perf list' to list available events",
2985 		     trace__parse_events_option),
2986 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2987 		    "show the thread COMM next to its id"),
2988 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2989 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2990 		     trace__parse_events_option),
2991 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2992 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2993 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2994 		    "trace events on existing process id"),
2995 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2996 		    "trace events on existing thread id"),
2997 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2998 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2999 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3000 		    "system-wide collection from all CPUs"),
3001 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3002 		    "list of cpus to monitor"),
3003 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3004 		    "child tasks do not inherit counters"),
3005 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3006 		     "number of mmap data pages",
3007 		     perf_evlist__parse_mmap_pages),
3008 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3009 		   "user to profile"),
3010 	OPT_CALLBACK(0, "duration", &trace, "float",
3011 		     "show only events with duration > N.M ms",
3012 		     trace__set_duration),
3013 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3014 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3015 	OPT_BOOLEAN('T', "time", &trace.full_time,
3016 		    "Show full timestamp, not time relative to first start"),
3017 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3018 		    "Show only syscall summary with statistics"),
3019 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3020 		    "Show all syscalls and summary with statistics"),
3021 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3022 		     "Trace pagefaults", parse_pagefaults, "maj"),
3023 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3024 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3025 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3026 		     "record_mode[,record_size]", record_callchain_help,
3027 		     &record_parse_callchain_opt),
3028 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3029 		    "Show the kernel callchains on the syscall exit path"),
3030 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3031 		     "Set the minimum stack depth when parsing the callchain, "
3032 		     "anything below the specified depth will be ignored."),
3033 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3034 		     "Set the maximum stack depth when parsing the callchain, "
3035 		     "anything beyond the specified depth will be ignored. "
3036 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3037 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3038 			"per thread proc mmap processing timeout in ms"),
3039 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3040 		     "ms to wait before starting measurement after program "
3041 		     "start"),
3042 	OPT_END()
3043 	};
3044 	bool __maybe_unused max_stack_user_set = true;
3045 	bool mmap_pages_user_set = true;
3046 	const char * const trace_subcommands[] = { "record", NULL };
3047 	int err;
3048 	char bf[BUFSIZ];
3049 
3050 	signal(SIGSEGV, sighandler_dump_stack);
3051 	signal(SIGFPE, sighandler_dump_stack);
3052 
3053 	trace.evlist = perf_evlist__new();
3054 	trace.sctbl = syscalltbl__new();
3055 
3056 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3057 		pr_err("Not enough memory to run!\n");
3058 		err = -ENOMEM;
3059 		goto out;
3060 	}
3061 
3062 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3063 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3064 
3065 	err = bpf__setup_stdout(trace.evlist);
3066 	if (err) {
3067 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3068 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3069 		goto out;
3070 	}
3071 
3072 	err = -1;
3073 
3074 	if (trace.trace_pgfaults) {
3075 		trace.opts.sample_address = true;
3076 		trace.opts.sample_time = true;
3077 	}
3078 
3079 	if (trace.opts.mmap_pages == UINT_MAX)
3080 		mmap_pages_user_set = false;
3081 
3082 	if (trace.max_stack == UINT_MAX) {
3083 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3084 		max_stack_user_set = false;
3085 	}
3086 
3087 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3088 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3089 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3090 	}
3091 #endif
3092 
3093 	if (callchain_param.enabled) {
3094 		if (!mmap_pages_user_set && geteuid() == 0)
3095 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3096 
3097 		symbol_conf.use_callchain = true;
3098 	}
3099 
3100 	if (trace.evlist->nr_entries > 0)
3101 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3102 
3103 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3104 		return trace__record(&trace, argc-1, &argv[1]);
3105 
3106 	/* summary_only implies summary option, but don't overwrite summary if set */
3107 	if (trace.summary_only)
3108 		trace.summary = trace.summary_only;
3109 
3110 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3111 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3112 		pr_err("Please specify something to trace.\n");
3113 		return -1;
3114 	}
3115 
3116 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3117 		pr_err("The -e option can't be used with --no-syscalls.\n");
3118 		goto out;
3119 	}
3120 
3121 	if (output_name != NULL) {
3122 		err = trace__open_output(&trace, output_name);
3123 		if (err < 0) {
3124 			perror("failed to create output file");
3125 			goto out;
3126 		}
3127 	}
3128 
3129 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3130 
3131 	err = target__validate(&trace.opts.target);
3132 	if (err) {
3133 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3134 		fprintf(trace.output, "%s", bf);
3135 		goto out_close;
3136 	}
3137 
3138 	err = target__parse_uid(&trace.opts.target);
3139 	if (err) {
3140 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3141 		fprintf(trace.output, "%s", bf);
3142 		goto out_close;
3143 	}
3144 
3145 	if (!argc && target__none(&trace.opts.target))
3146 		trace.opts.target.system_wide = true;
3147 
3148 	if (input_name)
3149 		err = trace__replay(&trace);
3150 	else
3151 		err = trace__run(&trace, argc, argv);
3152 
3153 out_close:
3154 	if (output_name != NULL)
3155 		fclose(trace.output);
3156 out:
3157 	return err;
3158 }
3159