xref: /linux/tools/perf/builtin-trace.c (revision 26b433d0da062d6e19d75350c0171d3cf8ff560d)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Per-run state for 'perf trace': the perf_tool callbacks, the syscall
 * descriptor table, the evlist being monitored and the knobs set from
 * the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;
		struct {			/* sys_enter/sys_exit tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread whose event was last processed */
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids whose events are filtered out */
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* tool self-stats, see show_tool_stats */
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is a deny list */
	bool			live;		/* live mode vs. processing a perf.data file */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* vfs_getname probe is in use */
	int			trace_pgfaults;
	int			open_id;	/* NOTE(review): presumably the "open" syscall id, cached — confirm at init site */
};
122 
/*
 * Accessor for one tracepoint field: the field's offset into the raw
 * sample payload plus a callback that fetches it either as an integer
 * or as a pointer into the payload (only one is ever used, hence the
 * anonymous union).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate the tp_field__u{8,16,32,64}() readers.  The memcpy() avoids
 * potentially unaligned loads from the raw payload.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Same readers, but byte-swapping the value: used when the file was
 * recorded on a machine of the opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155 
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
/* Return the address of this field inside the raw sample payload. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind 'field' so ->pointer yields the in-payload address of 'format_field'. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
193 
/*
 * Fields of interest in the sys_enter/sys_exit tracepoints: the syscall
 * id plus either the argument array (enter) or the return value (exit).
 * An evsel is only ever one of the two, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Initialize the tp_field member called 'name' in the syscall_tp hanging
 * off evsel->priv from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216 
/*
 * Pointer-field counterpart of perf_evsel__init_tp_uint_field(): bind
 * 'field' to the in-payload address of the tracepoint field 'name'.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
228 
/* Pointer counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

/* Free the evsel's private syscall_tp before deleting the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
/*
 * Create an evsel for the raw_syscalls tracepoint named by 'direction'
 * ("sys_enter"/"sys_exit"), falling back to the older "syscalls" group,
 * then attach a syscall_tp and 'handler'.  Returns NULL on any failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}
277 
/* Fetch the syscall_tp field 'name' from 'sample' as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Fetch the syscall_tp field 'name' as a pointer into the raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297 						const char *intfmt,
298 					        struct syscall_arg *arg)
299 {
300 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
301 }
302 
/* Like the above, with a plain "%d" fallback for out-of-table values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
/* A set of strarrays tried in order when formatting a value. */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define 'strarrays__<array>' wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	-100
343 #endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359 					      struct syscall_arg *arg);
360 
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
/*
 * Generic fallback formatters for arguments with no dedicated beautifier.
 * NOTE(review): the "%d"/"%ld"/"%#lx" specifiers assume arg->val is a
 * long-sized integer — confirm against the syscall_arg definition in
 * trace/beauty/beauty.h.
 */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377 
/*
 * String tables used by the STRARRAY() argument formatters to map
 * syscall constants to their symbolic names.  Each entry's index (plus
 * the DEFINE_STRARRAY_OFFSET() offset, where used) must match the
 * kernel's value for that constant.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* offset 1: presumably because EPOLL_CTL_ADD starts at 1 — confirm against uapi */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer toolchains, hence the #ifdefs */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Linux-specific fcntl commands, offset by F_LINUX_SPECIFIC_BASE (1024) */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl tables, tried in order by syscall_arg__scnprintf_strarrays() */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
/*
 * Format the access(2) mode argument: "F" for F_OK alone, otherwise a
 * concatenation of the R/W/X letters, with any unknown leftover bits
 * appended in hex (e.g. "RW", "RWX|0x8").
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* Any bits not consumed above are shown numerically. */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487 					      struct syscall_arg *arg);
488 
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
490 
/*
 * Format the pipe2(2) flags argument as "CLOEXEC|NONBLOCK", with any
 * remaining unknown bits appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Bits not covered by the P_FLAG list above. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
/* Fallback definitions for toolchains whose headers predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Format the getrandom(2) flags argument as "RANDOM|NONBLOCK", with any
 * remaining unknown bits appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Bits not covered by the P_FLAG list above. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
/*
 * Initializer for a syscall_arg_fmt printing via SCA_STRARRAY with the
 * given DEFINE_STRARRAY() table.  The 'name' argument is not expanded —
 * it is kept purely as self-documentation at the use sites.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/* How to format one syscall argument. */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;	/* handed to ->scnprintf via arg->parm */
	const char *name;
	bool	   show_zero;	/* NOTE(review): presumably forces printing zero-valued args — confirm at use site */
};
569 
570 static struct syscall_fmt {
571 	const char *name;
572 	const char *alias;
573 	struct syscall_arg_fmt arg[6];
574 	u8	   nr_args;
575 	bool	   errpid;
576 	bool	   timeout;
577 	bool	   hexret;
578 } syscall_fmts[] = {
579 	{ .name	    = "access",
580 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
581 	{ .name	    = "arch_prctl", .alias = "prctl", },
582 	{ .name	    = "bpf",
583 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584 	{ .name	    = "brk",	    .hexret = true,
585 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586 	{ .name     = "clock_gettime",
587 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
589 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
590 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
591 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
593 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
594 	{ .name	    = "close",
595 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596 	{ .name	    = "epoll_ctl",
597 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598 	{ .name	    = "eventfd2",
599 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600 	{ .name	    = "fchmodat",
601 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602 	{ .name	    = "fchownat",
603 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
604 	{ .name	    = "fcntl",
605 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606 			   .parm      = &strarrays__fcntl_cmds_arrays,
607 			   .show_zero = true, },
608 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
609 	{ .name	    = "flock",
610 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611 	{ .name	    = "fstat", .alias = "newfstat", },
612 	{ .name	    = "fstatat", .alias = "newfstatat", },
613 	{ .name	    = "futex",
614 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615 	{ .name	    = "futimesat",
616 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617 	{ .name	    = "getitimer",
618 	  .arg = { [0] = STRARRAY(which, itimers), }, },
619 	{ .name	    = "getpid",	    .errpid = true, },
620 	{ .name	    = "getpgid",    .errpid = true, },
621 	{ .name	    = "getppid",    .errpid = true, },
622 	{ .name	    = "getrandom",
623 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624 	{ .name	    = "getrlimit",
625 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
626 	{ .name	    = "ioctl",
627 	  .arg = {
628 #if defined(__i386__) || defined(__x86_64__)
629 /*
630  * FIXME: Make this available to all arches.
631  */
632 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
633 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
634 #else
635 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
636 #endif
637 	{ .name	    = "keyctl",
638 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
639 	{ .name	    = "kill",
640 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
641 	{ .name	    = "linkat",
642 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
643 	{ .name	    = "lseek",
644 	  .arg = { [2] = STRARRAY(whence, whences), }, },
645 	{ .name	    = "lstat", .alias = "newlstat", },
646 	{ .name     = "madvise",
647 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
648 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
649 	{ .name	    = "mkdirat",
650 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
651 	{ .name	    = "mknodat",
652 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
653 	{ .name	    = "mlock",
654 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
655 	{ .name	    = "mlockall",
656 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
657 	{ .name	    = "mmap",	    .hexret = true,
658 /* The standard mmap maps to old_mmap on s390x */
659 #if defined(__s390x__)
660 	.alias = "old_mmap",
661 #endif
662 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
663 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
664 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
665 	{ .name	    = "mprotect",
666 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
667 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
668 	{ .name	    = "mq_unlink",
669 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
670 	{ .name	    = "mremap",	    .hexret = true,
671 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
672 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
673 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
674 	{ .name	    = "munlock",
675 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
676 	{ .name	    = "munmap",
677 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
678 	{ .name	    = "name_to_handle_at",
679 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
680 	{ .name	    = "newfstatat",
681 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
682 	{ .name	    = "open",
683 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
684 	{ .name	    = "open_by_handle_at",
685 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
686 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
687 	{ .name	    = "openat",
688 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
689 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
690 	{ .name	    = "perf_event_open",
691 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
692 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
693 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
694 	{ .name	    = "pipe2",
695 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "pkey_alloc",
697 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
698 	{ .name	    = "pkey_free",
699 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
700 	{ .name	    = "pkey_mprotect",
701 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
702 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
703 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
704 	{ .name	    = "poll", .timeout = true, },
705 	{ .name	    = "ppoll", .timeout = true, },
706 	{ .name	    = "pread", .alias = "pread64", },
707 	{ .name	    = "preadv", .alias = "pread", },
708 	{ .name	    = "prlimit64",
709 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
710 	{ .name	    = "pwrite", .alias = "pwrite64", },
711 	{ .name	    = "readlinkat",
712 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
713 	{ .name	    = "recvfrom",
714 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
715 	{ .name	    = "recvmmsg",
716 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
717 	{ .name	    = "recvmsg",
718 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
719 	{ .name	    = "renameat",
720 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
721 	{ .name	    = "rt_sigaction",
722 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
723 	{ .name	    = "rt_sigprocmask",
724 	  .arg = { [0] = STRARRAY(how, sighow), }, },
725 	{ .name	    = "rt_sigqueueinfo",
726 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
727 	{ .name	    = "rt_tgsigqueueinfo",
728 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
729 	{ .name	    = "sched_setscheduler",
730 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
731 	{ .name	    = "seccomp",
732 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
733 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
734 	{ .name	    = "select", .timeout = true, },
735 	{ .name	    = "sendmmsg",
736 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
737 	{ .name	    = "sendmsg",
738 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
739 	{ .name	    = "sendto",
740 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
741 	{ .name	    = "set_tid_address", .errpid = true, },
742 	{ .name	    = "setitimer",
743 	  .arg = { [0] = STRARRAY(which, itimers), }, },
744 	{ .name	    = "setrlimit",
745 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
746 	{ .name	    = "socket",
747 	  .arg = { [0] = STRARRAY(family, socket_families),
748 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
749 	{ .name	    = "socketpair",
750 	  .arg = { [0] = STRARRAY(family, socket_families),
751 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
752 	{ .name	    = "stat", .alias = "newstat", },
753 	{ .name	    = "statx",
754 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
755 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
756 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
757 	{ .name	    = "swapoff",
758 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
759 	{ .name	    = "swapon",
760 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
761 	{ .name	    = "symlinkat",
762 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
763 	{ .name	    = "tgkill",
764 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
765 	{ .name	    = "tkill",
766 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
767 	{ .name	    = "uname", .alias = "newuname", },
768 	{ .name	    = "unlinkat",
769 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
770 	{ .name	    = "utimensat",
771 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
772 	{ .name	    = "wait4",	    .errpid = true,
773 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
774 	{ .name	    = "waitid",	    .errpid = true,
775 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
776 };
777 
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780 	const struct syscall_fmt *fmt = fmtp;
781 	return strcmp(name, fmt->name);
782 }
783 
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786 	const int nmemb = ARRAY_SIZE(syscall_fmts);
787 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789 
/* Everything 'perf trace' knows about one syscall. */
struct syscall {
	struct event_format *tp_format;	/* parsed sys_enter tracepoint format */
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* NOTE(review): presumably true for non-returning syscalls — confirm at init site */
	struct syscall_fmt  *fmt;	/* entry from syscall_fmts[], may be NULL */
	struct syscall_arg_fmt *arg_fmt;
};
799 
800 /*
801  * We need to have this 'calculated' boolean because in some cases we really
802  * don't know what is the duration of a syscall, for instance, when we start
803  * a session and some threads are waiting for a syscall to finish, say 'poll',
804  * in which case all we can do is to print "( ? ) for duration and for the
805  * start timestamp.
806  */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809 	double duration = (double)t / NSEC_PER_MSEC;
810 	size_t printed = fprintf(fp, "(");
811 
812 	if (!calculated)
813 		printed += fprintf(fp, "     ?   ");
814 	else if (duration >= 1.0)
815 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 	else if (duration >= 0.01)
817 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818 	else
819 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 	return printed + fprintf(fp, "): ");
821 }
822 
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 *
 * Per-thread state, hung off struct thread via thread__{set_,}priv().
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line awaiting sys_exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {			/* fd -> pathname cache, grown on demand */
		int	  max;		/* highest valid index, -1 when empty */
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
852 
853 static struct thread_trace *thread_trace__new(void)
854 {
855 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856 
857 	if (ttrace)
858 		ttrace->paths.max = -1;
859 
860 	ttrace->syscall_stats = intlist__new(NULL);
861 
862 	return ttrace;
863 }
864 
/*
 * Lazily create and return the thread_trace for 'thread', bumping its
 * event counter.  On a NULL thread or allocation failure a warning is
 * printed to 'fp' and NULL is returned so the caller drops the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* First event seen on this thread: allocate its state. */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
887 
888 
/*
 * Let an argument beautifier override how this syscall's return value
 * will be formatted, e.g. fcntl returns fds, file flags, etc. depending
 * on the cmd.  Assumes thread__priv(arg->thread) was already set up by
 * thread__trace().
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
896 
897 #define TRACE_PFMAJ		(1 << 0)
898 #define TRACE_PFMIN		(1 << 1)
899 
900 static const size_t trace__entry_str_size = 2048;
901 
/*
 * Remember 'pathname' for 'fd' in the thread's fd->path cache, growing
 * the table as needed.  Returns 0 on success, -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* Zero only the slots added by this growth. */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/* NOTE(review): a pre-existing entry is overwritten without free() — possible leak; confirm callers always see a fresh fd. */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
927 
/*
 * Resolve 'fd' to a pathname by reading the /proc/<pid>/fd/<fd> (or, for
 * a non-leader thread, /proc/<pid>/task/<tid>/fd/<fd>) symlink, and cache
 * the result via trace__set_fd_pathname().  Returns 0 on success, -1 on
 * any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* The symlink's st_size is the target length; make sure it fits. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* ret > st.st_size means the link changed between lstat() and readlink(). */
	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
953 
954 static const char *thread__fd_path(struct thread *thread, int fd,
955 				   struct trace *trace)
956 {
957 	struct thread_trace *ttrace = thread__priv(thread);
958 
959 	if (ttrace == NULL)
960 		return NULL;
961 
962 	if (fd < 0)
963 		return NULL;
964 
965 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
966 		if (!trace->live)
967 			return NULL;
968 		++trace->stats.proc_getname;
969 		if (thread__read_fd_path(thread, fd))
970 			return NULL;
971 	}
972 
973 	return ttrace->paths.table[fd];
974 }
975 
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
977 {
978 	int fd = arg->val;
979 	size_t printed = scnprintf(bf, size, "%d", fd);
980 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981 
982 	if (path)
983 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984 
985 	return printed;
986 }
987 
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989 					      struct syscall_arg *arg)
990 {
991 	int fd = arg->val;
992 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993 	struct thread_trace *ttrace = thread__priv(arg->thread);
994 
995 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996 		zfree(&ttrace->paths.table[fd]);
997 
998 	return printed;
999 }
1000 
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002 				     unsigned long ptr)
1003 {
1004 	struct thread_trace *ttrace = thread__priv(thread);
1005 
1006 	ttrace->filename.ptr = ptr;
1007 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009 
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 					      struct syscall_arg *arg)
1012 {
1013 	unsigned long ptr = arg->val;
1014 
1015 	if (!arg->trace->vfs_getname)
1016 		return scnprintf(bf, size, "%#x", ptr);
1017 
1018 	thread__set_filename_pos(arg->thread, bf, ptr);
1019 	return 0;
1020 }
1021 
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026 
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030 
1031 	return fprintf(fp, "%10.3f ", ts);
1032 }
1033 
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042 	if (tstamp > 0)
1043 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1044 
1045 	return fprintf(fp, "         ? ");
1046 }
1047 
static bool done	= false;
static bool interrupted = false;

/*
 * Signal handler: flag the main loop to wind down; 'interrupted'
 * distinguishes a user ^C (SIGINT) from other termination causes.
 * NOTE(review): plain bool rather than volatile sig_atomic_t -- fine on
 * the usual targets but not strictly async-signal-safe per the C
 * standard; confirm before relying on it elsewhere.
 */
static void sig_handler(int sig)
{
	interrupted = sig == SIGINT;
	done = true;
}
1056 
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 	printed += fprintf_duration(duration, duration_calculated, fp);
1062 
1063 	if (trace->multiple_threads) {
1064 		if (trace->show_comm)
1065 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 		printed += fprintf(fp, "%d ", thread->tid);
1067 	}
1068 
1069 	return printed;
1070 }
1071 
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 				union perf_event *event, struct perf_sample *sample)
1074 {
1075 	int ret = 0;
1076 
1077 	switch (event->header.type) {
1078 	case PERF_RECORD_LOST:
1079 		color_fprintf(trace->output, PERF_COLOR_RED,
1080 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 		ret = machine__process_lost_event(machine, event, sample);
1082 		break;
1083 	default:
1084 		ret = machine__process_event(machine, event, sample);
1085 		break;
1086 	}
1087 
1088 	return ret;
1089 }
1090 
1091 static int trace__tool_process(struct perf_tool *tool,
1092 			       union perf_event *event,
1093 			       struct perf_sample *sample,
1094 			       struct machine *machine)
1095 {
1096 	struct trace *trace = container_of(tool, struct trace, tool);
1097 	return trace__process_event(trace, machine, event, sample);
1098 }
1099 
/*
 * Kernel address resolver wrapper: refuse to resolve (and warn once) when
 * /proc/sys/kernel/kptr_restrict hides kernel addresses from us.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Warn only once per machine. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1117 
/*
 * Initialize symbol resolution and the host machine representation, then
 * synthesize events for already-running threads so their maps are known.
 * Returns 0 on success or a negative error; tears down the symbol
 * subsystem if thread synthesis fails.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* NOTE(review): -errno assumes the resolver setter sets errno on failure -- confirm. */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1140 
1141 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1142 {
1143 	int idx;
1144 
1145 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1146 		nr_args = sc->fmt->nr_args;
1147 
1148 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1149 	if (sc->arg_fmt == NULL)
1150 		return -1;
1151 
1152 	for (idx = 0; idx < nr_args; ++idx) {
1153 		if (sc->fmt)
1154 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1155 	}
1156 
1157 	sc->nr_args = nr_args;
1158 	return 0;
1159 }
1160 
/*
 * Pick a default formatter for each tracepoint argument based on its
 * type/name, for the slots the hand-written fmt table didn't claim.
 * Order matters: the "const char *" filename check must precede the
 * generic pointer check, since both would match.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A hand-picked formatter wins over the heuristics below. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Heuristic: integer args whose name ends in "fd" are
			 * file descriptors. Survey of the sys_enter formats:
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1199 
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, look
 * up its hand-written fmt entry and its syscalls:sys_enter_* tracepoint
 * format (falling back to the fmt alias for renamed syscalls), then set
 * up the per-argument formatters. Returns 0 on success, -1 otherwise.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table to cover 'id', zeroing the newly added entries. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Renamed syscalls: retry the tracepoint lookup under the alias. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a tracepoint format, fall back to the generic 6 args. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * Discard the leading '__syscall_nr' (or 'nr' on older kernels)
	 * field: it carries the syscall number, which we already have,
	 * and is not a real syscall argument.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1260 
1261 static int trace__validate_ev_qualifier(struct trace *trace)
1262 {
1263 	int err = 0, i;
1264 	struct str_node *pos;
1265 
1266 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1267 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1268 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1269 
1270 	if (trace->ev_qualifier_ids.entries == NULL) {
1271 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1272 		       trace->output);
1273 		err = -EINVAL;
1274 		goto out;
1275 	}
1276 
1277 	i = 0;
1278 
1279 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1280 		const char *sc = pos->s;
1281 		int id = syscalltbl__id(trace->sctbl, sc);
1282 
1283 		if (id < 0) {
1284 			if (err == 0) {
1285 				fputs("Error:\tInvalid syscall ", trace->output);
1286 				err = -EINVAL;
1287 			} else {
1288 				fputs(", ", trace->output);
1289 			}
1290 
1291 			fputs(sc, trace->output);
1292 		}
1293 
1294 		trace->ev_qualifier_ids.entries[i++] = id;
1295 	}
1296 
1297 	if (err < 0) {
1298 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1299 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1300 		zfree(&trace->ev_qualifier_ids.entries);
1301 		trace->ev_qualifier_ids.nr = 0;
1302 	}
1303 out:
1304 	return err;
1305 }
1306 
1307 /*
1308  * args is to be interpreted as a series of longs but we need to handle
1309  * 8-byte unaligned accesses. args points to raw_data within the event
1310  * and raw_data is guaranteed to be 8-byte unaligned because it is
1311  * preceded by raw_size which is a u32. So we need to copy args to a temp
1312  * variable to read it. Most notably this avoids extended load instructions
1313  * on unaligned addresses
1314  */
1315 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1316 {
1317 	unsigned long val;
1318 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1319 
1320 	memcpy(&val, p, sizeof(val));
1321 	return val;
1322 }
1323 
1324 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1325 				      struct syscall_arg *arg)
1326 {
1327 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1328 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1329 
1330 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1331 }
1332 
1333 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1334 				     struct syscall_arg *arg, unsigned long val)
1335 {
1336 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1337 		arg->val = val;
1338 		if (sc->arg_fmt[arg->idx].parm)
1339 			arg->parm = sc->arg_fmt[arg->idx].parm;
1340 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1341 	}
1342 	return scnprintf(bf, size, "%ld", val);
1343 }
1344 
/*
 * Format all of a syscall's arguments into 'bf'. Two paths: when the
 * tracepoint format is known, walk the named fields; otherwise fall back
 * to printing the raw values. arg.mask lets formatters consume and
 * suppress sibling arguments (one bit per argument index).
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A previous formatter already consumed this argument. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1417 
/* Per-tracepoint sample handler (sys_enter, sys_exit, pgfault, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1421 
/*
 * Return the (lazily initialized) syscall descriptor for 'id', or NULL
 * when the id is invalid or its information cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* First sighting of this id: read its info into the table. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed part-way. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1464 
1465 static void thread__update_stats(struct thread_trace *ttrace,
1466 				 int id, struct perf_sample *sample)
1467 {
1468 	struct int_node *inode;
1469 	struct stats *stats;
1470 	u64 duration = 0;
1471 
1472 	inode = intlist__findnew(ttrace->syscall_stats, id);
1473 	if (inode == NULL)
1474 		return;
1475 
1476 	stats = inode->priv;
1477 	if (stats == NULL) {
1478 		stats = malloc(sizeof(struct stats));
1479 		if (stats == NULL)
1480 			return;
1481 		init_stats(stats);
1482 		inode->priv = stats;
1483 	}
1484 
1485 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1486 		duration = sample->time - ttrace->entry_time;
1487 
1488 	update_stats(stats, duration);
1489 }
1490 
1491 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1492 {
1493 	struct thread_trace *ttrace;
1494 	u64 duration;
1495 	size_t printed;
1496 
1497 	if (trace->current == NULL)
1498 		return 0;
1499 
1500 	ttrace = thread__priv(trace->current);
1501 
1502 	if (!ttrace->entry_pending)
1503 		return 0;
1504 
1505 	duration = sample->time - ttrace->entry_time;
1506 
1507 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1508 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1509 	ttrace->entry_pending = false;
1510 
1511 	return printed;
1512 }
1513 
/*
 * raw_syscalls:sys_enter handler: stage "name(args" into the per-thread
 * entry_str buffer; the closing paren and return value are emitted by
 * trace__sys_exit(), or by trace__printf_interrupted_entry() if another
 * thread's event shows up in between.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the entry line is staged. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print the line right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1572 
1573 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1574 				    struct perf_sample *sample,
1575 				    struct callchain_cursor *cursor)
1576 {
1577 	struct addr_location al;
1578 
1579 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1580 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1581 		return -1;
1582 
1583 	return 0;
1584 }
1585 
1586 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1587 {
1588 	/* TODO: user-configurable print_opts */
1589 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1590 				        EVSEL__PRINT_DSO |
1591 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1592 
1593 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1594 }
1595 
/*
 * raw_syscalls:sys_exit handler: complete the line staged by
 * trace__sys_enter() (or print a "continued" marker when the entry was
 * already flushed), formatting the return value per the syscall's fmt:
 * errno name, timeout, hex, child pid, a custom ret_scnprintf, or plain
 * signed decimal.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open() returned an fd: bind the name from vfs_getname to it. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry already flushed by trace__printf_interrupted_entry(). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/*
	 * NB: the labels below let the no-fmt and default cases share the
	 * errno/signed printing paths -- keep the goto topology intact.
	 */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed by an argument beautifier. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork/clone/wait*): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1709 
/*
 * probe:vfs_getname handler: cache the resolved pathname (for later fd
 * binding in trace__sys_exit()) and, when a filename argument slot was
 * reserved in the staged entry string, splice the name into it at the
 * recorded position, truncating from the left if it doesn't fit.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer to fit this pathname. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No filename slot reserved in the entry string? We're done. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the name when it doesn't fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1770 
1771 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1772 				     union perf_event *event __maybe_unused,
1773 				     struct perf_sample *sample)
1774 {
1775         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1776 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1777 	struct thread *thread = machine__findnew_thread(trace->host,
1778 							sample->pid,
1779 							sample->tid);
1780 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1781 
1782 	if (ttrace == NULL)
1783 		goto out_dump;
1784 
1785 	ttrace->runtime_ms += runtime_ms;
1786 	trace->runtime_ms += runtime_ms;
1787 out_put:
1788 	thread__put(thread);
1789 	return 0;
1790 
1791 out_dump:
1792 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1793 	       evsel->name,
1794 	       perf_evsel__strval(evsel, sample, "comm"),
1795 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1796 	       runtime,
1797 	       perf_evsel__intval(evsel, sample, "vruntime"));
1798 	goto out_put;
1799 }
1800 
1801 static void bpf_output__printer(enum binary_printer_ops op,
1802 				unsigned int val, void *extra)
1803 {
1804 	FILE *output = extra;
1805 	unsigned char ch = (unsigned char)val;
1806 
1807 	switch (op) {
1808 	case BINARY_PRINT_CHAR_DATA:
1809 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1810 		break;
1811 	case BINARY_PRINT_DATA_BEGIN:
1812 	case BINARY_PRINT_LINE_BEGIN:
1813 	case BINARY_PRINT_ADDR:
1814 	case BINARY_PRINT_NUM_DATA:
1815 	case BINARY_PRINT_NUM_PAD:
1816 	case BINARY_PRINT_SEP:
1817 	case BINARY_PRINT_CHAR_PAD:
1818 	case BINARY_PRINT_LINE_END:
1819 	case BINARY_PRINT_DATA_END:
1820 	default:
1821 		break;
1822 	}
1823 }
1824 
1825 static void bpf_output__fprintf(struct trace *trace,
1826 				struct perf_sample *sample)
1827 {
1828 	print_binary(sample->raw_data, sample->raw_size, 8,
1829 		     bpf_output__printer, trace->output);
1830 }
1831 
/*
 * Generic handler for --event tracepoints and bpf-output events: print
 * timestamp, event name and the formatted payload, plus the callchain
 * when requested and deep enough.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress the whole event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1872 
1873 static void print_location(FILE *f, struct perf_sample *sample,
1874 			   struct addr_location *al,
1875 			   bool print_dso, bool print_sym)
1876 {
1877 
1878 	if ((verbose > 0 || print_dso) && al->map)
1879 		fprintf(f, "%s@", al->map->dso->long_name);
1880 
1881 	if ((verbose > 0 || print_sym) && al->sym)
1882 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1883 			al->addr - al->sym->start);
1884 	else if (al->map)
1885 		fprintf(f, "0x%" PRIx64, al->addr);
1886 	else
1887 		fprintf(f, "0x%" PRIx64, sample->addr);
1888 }
1889 
/*
 * Page fault event handler: count major/minor faults per thread and
 * print "majfault/minfault [faulting-ip] => target-addr (type+level)",
 * resolving the target first as data, then as code, else '?'.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below --min-stack: suppress the whole event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* The faulted address: try data maps first ... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ... then executable maps ('x'), else unknown ('?'). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1965 
1966 static void trace__set_base_time(struct trace *trace,
1967 				 struct perf_evsel *evsel,
1968 				 struct perf_sample *sample)
1969 {
1970 	/*
1971 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1972 	 * and don't use sample->time unconditionally, we may end up having
1973 	 * some other event in the future without PERF_SAMPLE_TIME for good
1974 	 * reason, i.e. we may not be interested in its timestamps, just in
1975 	 * it taking place, picking some piece of information when it
1976 	 * appears in our event stream (vfs_getname comes to mind).
1977 	 */
1978 	if (trace->base_time == 0 && !trace->full_time &&
1979 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1980 		trace->base_time = sample->time;
1981 }
1982 
1983 static int trace__process_sample(struct perf_tool *tool,
1984 				 union perf_event *event,
1985 				 struct perf_sample *sample,
1986 				 struct perf_evsel *evsel,
1987 				 struct machine *machine __maybe_unused)
1988 {
1989 	struct trace *trace = container_of(tool, struct trace, tool);
1990 	struct thread *thread;
1991 	int err = 0;
1992 
1993 	tracepoint_handler handler = evsel->handler;
1994 
1995 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1996 	if (thread && thread__is_filtered(thread))
1997 		goto out;
1998 
1999 	trace__set_base_time(trace, evsel, sample);
2000 
2001 	if (handler) {
2002 		++trace->nr_events;
2003 		handler(trace, evsel, event, sample);
2004 	}
2005 out:
2006 	thread__put(thread);
2007 	return err;
2008 }
2009 
2010 static int trace__record(struct trace *trace, int argc, const char **argv)
2011 {
2012 	unsigned int rec_argc, i, j;
2013 	const char **rec_argv;
2014 	const char * const record_args[] = {
2015 		"record",
2016 		"-R",
2017 		"-m", "1024",
2018 		"-c", "1",
2019 	};
2020 
2021 	const char * const sc_args[] = { "-e", };
2022 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2023 	const char * const majpf_args[] = { "-e", "major-faults" };
2024 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2025 	const char * const minpf_args[] = { "-e", "minor-faults" };
2026 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2027 
2028 	/* +1 is for the event string below */
2029 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2030 		majpf_args_nr + minpf_args_nr + argc;
2031 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2032 
2033 	if (rec_argv == NULL)
2034 		return -ENOMEM;
2035 
2036 	j = 0;
2037 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2038 		rec_argv[j++] = record_args[i];
2039 
2040 	if (trace->trace_syscalls) {
2041 		for (i = 0; i < sc_args_nr; i++)
2042 			rec_argv[j++] = sc_args[i];
2043 
2044 		/* event string may be different for older kernels - e.g., RHEL6 */
2045 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2046 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2047 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2048 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2049 		else {
2050 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2051 			return -1;
2052 		}
2053 	}
2054 
2055 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2056 		for (i = 0; i < majpf_args_nr; i++)
2057 			rec_argv[j++] = majpf_args[i];
2058 
2059 	if (trace->trace_pgfaults & TRACE_PFMIN)
2060 		for (i = 0; i < minpf_args_nr; i++)
2061 			rec_argv[j++] = minpf_args[i];
2062 
2063 	for (i = 0; i < (unsigned int)argc; i++)
2064 		rec_argv[j++] = argv[i];
2065 
2066 	return cmd_record(j, rec_argv);
2067 }
2068 
2069 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2070 
2071 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2072 {
2073 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2074 
2075 	if (IS_ERR(evsel))
2076 		return false;
2077 
2078 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2079 		perf_evsel__delete(evsel);
2080 		return false;
2081 	}
2082 
2083 	evsel->handler = trace__vfs_getname;
2084 	perf_evlist__add(evlist, evsel);
2085 	return true;
2086 }
2087 
2088 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2089 {
2090 	struct perf_evsel *evsel;
2091 	struct perf_event_attr attr = {
2092 		.type = PERF_TYPE_SOFTWARE,
2093 		.mmap_data = 1,
2094 	};
2095 
2096 	attr.config = config;
2097 	attr.sample_period = 1;
2098 
2099 	event_attr_init(&attr);
2100 
2101 	evsel = perf_evsel__new(&attr);
2102 	if (evsel)
2103 		evsel->handler = trace__pgfault;
2104 
2105 	return evsel;
2106 }
2107 
2108 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2109 {
2110 	const u32 type = event->header.type;
2111 	struct perf_evsel *evsel;
2112 
2113 	if (type != PERF_RECORD_SAMPLE) {
2114 		trace__process_event(trace, trace->host, event, sample);
2115 		return;
2116 	}
2117 
2118 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2119 	if (evsel == NULL) {
2120 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2121 		return;
2122 	}
2123 
2124 	trace__set_base_time(trace, evsel, sample);
2125 
2126 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2127 	    sample->raw_data == NULL) {
2128 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2129 		       perf_evsel__name(evsel), sample->tid,
2130 		       sample->cpu, sample->raw_size);
2131 	} else {
2132 		tracepoint_handler handler = evsel->handler;
2133 		handler(trace, evsel, event, sample);
2134 	}
2135 }
2136 
2137 static int trace__add_syscall_newtp(struct trace *trace)
2138 {
2139 	int ret = -1;
2140 	struct perf_evlist *evlist = trace->evlist;
2141 	struct perf_evsel *sys_enter, *sys_exit;
2142 
2143 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2144 	if (sys_enter == NULL)
2145 		goto out;
2146 
2147 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2148 		goto out_delete_sys_enter;
2149 
2150 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2151 	if (sys_exit == NULL)
2152 		goto out_delete_sys_enter;
2153 
2154 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2155 		goto out_delete_sys_exit;
2156 
2157 	perf_evlist__add(evlist, sys_enter);
2158 	perf_evlist__add(evlist, sys_exit);
2159 
2160 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2161 		/*
2162 		 * We're interested only in the user space callchain
2163 		 * leading to the syscall, allow overriding that for
2164 		 * debugging reasons using --kernel_syscall_callchains
2165 		 */
2166 		sys_exit->attr.exclude_callchain_kernel = 1;
2167 	}
2168 
2169 	trace->syscalls.events.sys_enter = sys_enter;
2170 	trace->syscalls.events.sys_exit  = sys_exit;
2171 
2172 	ret = 0;
2173 out:
2174 	return ret;
2175 
2176 out_delete_sys_exit:
2177 	perf_evsel__delete_priv(sys_exit);
2178 out_delete_sys_enter:
2179 	perf_evsel__delete_priv(sys_enter);
2180 	goto out;
2181 }
2182 
2183 static int trace__set_ev_qualifier_filter(struct trace *trace)
2184 {
2185 	int err = -1;
2186 	struct perf_evsel *sys_exit;
2187 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2188 						trace->ev_qualifier_ids.nr,
2189 						trace->ev_qualifier_ids.entries);
2190 
2191 	if (filter == NULL)
2192 		goto out_enomem;
2193 
2194 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2195 					  filter)) {
2196 		sys_exit = trace->syscalls.events.sys_exit;
2197 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2198 	}
2199 
2200 	free(filter);
2201 out:
2202 	return err;
2203 out_enomem:
2204 	errno = ENOMEM;
2205 	goto out;
2206 }
2207 
2208 static int trace__set_filter_loop_pids(struct trace *trace)
2209 {
2210 	unsigned int nr = 1;
2211 	pid_t pids[32] = {
2212 		getpid(),
2213 	};
2214 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2215 
2216 	while (thread && nr < ARRAY_SIZE(pids)) {
2217 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2218 
2219 		if (parent == NULL)
2220 			break;
2221 
2222 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2223 			pids[nr++] = parent->tid;
2224 			break;
2225 		}
2226 		thread = parent;
2227 	}
2228 
2229 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2230 }
2231 
/*
 * The live-mode workhorse: sets up the syscall tracepoints and optional
 * page fault / sched events, creates or attaches to the target, mmaps the
 * ring buffers and consumes events until the workload finishes or we get
 * interrupted, then optionally prints summaries. Returns 0 on success or
 * a negative error, printing a human readable diagnostic to trace->output.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	/* Page fault software events, requested via -F/--pf. */
	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	/* Per-evsel callchain configuration for the events that want it. */
	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: let the workload warm up before enabling the events. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
/*
 * Main event loop: drain all mmap'ed buffers, then poll; 'draining' means a
 * disable was issued (workload done) and we only consume what's left.
 */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new this pass: wait for more, or finish draining. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Cold error paths, reached only via goto: the inner block scopes errbuf
 * for the labels that need to format a message into it.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2495 
/*
 * Implements 'perf trace -i perf.data': replays a previously recorded
 * session through the same handlers used in live mode, wiring up the
 * raw_syscalls (or legacy syscalls) tracepoints and any page fault
 * software events found in the file.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Route the generic session events through our tool callbacks. */
	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2594 
/* Print the banner that precedes the per-thread summaries. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2603 
/*
 * Re-sort a thread's syscall_stats intlist by total time spent in each
 * syscall (msecs = n_calls * avg), highest first. The body below is the
 * per-node init callback run by the resort machinery: it copies the
 * syscall nr and its stats out of the intlist node into the sort entry.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2617 
/*
 * Print one thread's per-syscall statistics table (calls, total, min, avg,
 * max, stddev%), sorted by total time via the syscall_stats resort rb
 * defined above. Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* 'syscall_stats' and 'syscall_stats_entry' come from the resort macros. */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Raw values are in nanoseconds, table is in msec. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2660 
2661 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2662 {
2663 	size_t printed = 0;
2664 	struct thread_trace *ttrace = thread__priv(thread);
2665 	double ratio;
2666 
2667 	if (ttrace == NULL)
2668 		return 0;
2669 
2670 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2671 
2672 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2673 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2674 	printed += fprintf(fp, "%.1f%%", ratio);
2675 	if (ttrace->pfmaj)
2676 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2677 	if (ttrace->pfmin)
2678 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2679 	if (trace->sched)
2680 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2681 	else if (fputc('\n', fp) != EOF)
2682 		++printed;
2683 
2684 	printed += thread__dump_stats(ttrace, trace, fp);
2685 
2686 	return printed;
2687 }
2688 
2689 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2690 {
2691 	return ttrace ? ttrace->nr_events : 0;
2692 }
2693 
/*
 * Re-sort the machine's threads by their per-thread event count for the
 * end-of-run summary. The body is the per-node init callback run by the
 * resort machinery.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2700 
/*
 * Print the "Summary of events" section: one trace__fprintf_thread() block
 * per thread, ordered by event count via the threads resort rb above.
 * Returns the number of characters printed (0 when sorting failed).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	/* 'threads' and 'threads_entry' come from the resort macros. */
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2719 
2720 static int trace__set_duration(const struct option *opt, const char *str,
2721 			       int unset __maybe_unused)
2722 {
2723 	struct trace *trace = opt->value;
2724 
2725 	trace->duration_filter = atof(str);
2726 	return 0;
2727 }
2728 
2729 static int trace__set_filter_pids(const struct option *opt, const char *str,
2730 				  int unset __maybe_unused)
2731 {
2732 	int ret = -1;
2733 	size_t i;
2734 	struct trace *trace = opt->value;
2735 	/*
2736 	 * FIXME: introduce a intarray class, plain parse csv and create a
2737 	 * { int nr, int entries[] } struct...
2738 	 */
2739 	struct intlist *list = intlist__new(str);
2740 
2741 	if (list == NULL)
2742 		return -1;
2743 
2744 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2745 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2746 
2747 	if (trace->filter_pids.entries == NULL)
2748 		goto out;
2749 
2750 	trace->filter_pids.entries[0] = getpid();
2751 
2752 	for (i = 1; i < trace->filter_pids.nr; ++i)
2753 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2754 
2755 	intlist__delete(list);
2756 	ret = 0;
2757 out:
2758 	return ret;
2759 }
2760 
2761 static int trace__open_output(struct trace *trace, const char *filename)
2762 {
2763 	struct stat st;
2764 
2765 	if (!stat(filename, &st) && st.st_size) {
2766 		char oldname[PATH_MAX];
2767 
2768 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2769 		unlink(oldname);
2770 		rename(filename, oldname);
2771 	}
2772 
2773 	trace->output = fopen(filename, "w");
2774 
2775 	return trace->output == NULL ? -errno : 0;
2776 }
2777 
2778 static int parse_pagefaults(const struct option *opt, const char *str,
2779 			    int unset __maybe_unused)
2780 {
2781 	int *trace_pgfaults = opt->value;
2782 
2783 	if (strcmp(str, "all") == 0)
2784 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2785 	else if (strcmp(str, "maj") == 0)
2786 		*trace_pgfaults |= TRACE_PFMAJ;
2787 	else if (strcmp(str, "min") == 0)
2788 		*trace_pgfaults |= TRACE_PFMIN;
2789 	else
2790 		return -1;
2791 
2792 	return 0;
2793 }
2794 
2795 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2796 {
2797 	struct perf_evsel *evsel;
2798 
2799 	evlist__for_each_entry(evlist, evsel)
2800 		evsel->handler = handler;
2801 }
2802 
2803 /*
2804  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2805  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2806  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2807  *
2808  * It'd be better to introduce a parse_options() variant that would return a
2809  * list with the terms it didn't match to an event...
2810  */
2811 static int trace__parse_events_option(const struct option *opt, const char *str,
2812 				      int unset __maybe_unused)
2813 {
2814 	struct trace *trace = (struct trace *)opt->value;
2815 	const char *s = str;
2816 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2817 	int len = strlen(str) + 1, err = -1, list;
2818 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2819 	char group_name[PATH_MAX];
2820 
2821 	if (strace_groups_dir == NULL)
2822 		return -1;
2823 
2824 	if (*s == '!') {
2825 		++s;
2826 		trace->not_ev_qualifier = true;
2827 	}
2828 
2829 	while (1) {
2830 		if ((sep = strchr(s, ',')) != NULL)
2831 			*sep = '\0';
2832 
2833 		list = 0;
2834 		if (syscalltbl__id(trace->sctbl, s) >= 0) {
2835 			list = 1;
2836 		} else {
2837 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2838 			if (access(group_name, R_OK) == 0)
2839 				list = 1;
2840 		}
2841 
2842 		if (lists[list]) {
2843 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2844 		} else {
2845 			lists[list] = malloc(len);
2846 			if (lists[list] == NULL)
2847 				goto out;
2848 			strcpy(lists[list], s);
2849 		}
2850 
2851 		if (!sep)
2852 			break;
2853 
2854 		*sep = ',';
2855 		s = sep + 1;
2856 	}
2857 
2858 	if (lists[1] != NULL) {
2859 		struct strlist_config slist_config = {
2860 			.dirname = strace_groups_dir,
2861 		};
2862 
2863 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2864 		if (trace->ev_qualifier == NULL) {
2865 			fputs("Not enough memory to parse event qualifier", trace->output);
2866 			goto out;
2867 		}
2868 
2869 		if (trace__validate_ev_qualifier(trace))
2870 			goto out;
2871 	}
2872 
2873 	err = 0;
2874 
2875 	if (lists[0]) {
2876 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2877 					       "event selector. use 'perf list' to list available events",
2878 					       parse_events_option);
2879 		err = parse_events_option(&o, lists[0], 0);
2880 	}
2881 out:
2882 	if (sep)
2883 		*sep = ',';
2884 
2885 	return err;
2886 }
2887 
2888 int cmd_trace(int argc, const char **argv)
2889 {
2890 	const char *trace_usage[] = {
2891 		"perf trace [<options>] [<command>]",
2892 		"perf trace [<options>] -- <command> [<options>]",
2893 		"perf trace record [<options>] [<command>]",
2894 		"perf trace record [<options>] -- <command> [<options>]",
2895 		NULL
2896 	};
2897 	struct trace trace = {
2898 		.syscalls = {
2899 			. max = -1,
2900 		},
2901 		.opts = {
2902 			.target = {
2903 				.uid	   = UINT_MAX,
2904 				.uses_mmap = true,
2905 			},
2906 			.user_freq     = UINT_MAX,
2907 			.user_interval = ULLONG_MAX,
2908 			.no_buffering  = true,
2909 			.mmap_pages    = UINT_MAX,
2910 			.proc_map_timeout  = 500,
2911 		},
2912 		.output = stderr,
2913 		.show_comm = true,
2914 		.trace_syscalls = true,
2915 		.kernel_syscallchains = false,
2916 		.max_stack = UINT_MAX,
2917 	};
2918 	const char *output_name = NULL;
2919 	const struct option trace_options[] = {
2920 	OPT_CALLBACK('e', "event", &trace, "event",
2921 		     "event/syscall selector. use 'perf list' to list available events",
2922 		     trace__parse_events_option),
2923 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2924 		    "show the thread COMM next to its id"),
2925 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2926 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2927 		     trace__parse_events_option),
2928 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2929 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2930 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2931 		    "trace events on existing process id"),
2932 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2933 		    "trace events on existing thread id"),
2934 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2935 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2936 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2937 		    "system-wide collection from all CPUs"),
2938 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2939 		    "list of cpus to monitor"),
2940 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2941 		    "child tasks do not inherit counters"),
2942 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2943 		     "number of mmap data pages",
2944 		     perf_evlist__parse_mmap_pages),
2945 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2946 		   "user to profile"),
2947 	OPT_CALLBACK(0, "duration", &trace, "float",
2948 		     "show only events with duration > N.M ms",
2949 		     trace__set_duration),
2950 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2951 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2952 	OPT_BOOLEAN('T', "time", &trace.full_time,
2953 		    "Show full timestamp, not time relative to first start"),
2954 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2955 		    "Show only syscall summary with statistics"),
2956 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2957 		    "Show all syscalls and summary with statistics"),
2958 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2959 		     "Trace pagefaults", parse_pagefaults, "maj"),
2960 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2961 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2962 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2963 		     "record_mode[,record_size]", record_callchain_help,
2964 		     &record_parse_callchain_opt),
2965 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
2966 		    "Show the kernel callchains on the syscall exit path"),
2967 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
2968 		     "Set the minimum stack depth when parsing the callchain, "
2969 		     "anything below the specified depth will be ignored."),
2970 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
2971 		     "Set the maximum stack depth when parsing the callchain, "
2972 		     "anything beyond the specified depth will be ignored. "
2973 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
2974 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
2975 			"per thread proc mmap processing timeout in ms"),
2976 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
2977 		     "ms to wait before starting measurement after program "
2978 		     "start"),
2979 	OPT_END()
2980 	};
2981 	bool __maybe_unused max_stack_user_set = true;
2982 	bool mmap_pages_user_set = true;
2983 	const char * const trace_subcommands[] = { "record", NULL };
2984 	int err;
2985 	char bf[BUFSIZ];
2986 
2987 	signal(SIGSEGV, sighandler_dump_stack);
2988 	signal(SIGFPE, sighandler_dump_stack);
2989 
2990 	trace.evlist = perf_evlist__new();
2991 	trace.sctbl = syscalltbl__new();
2992 
2993 	if (trace.evlist == NULL || trace.sctbl == NULL) {
2994 		pr_err("Not enough memory to run!\n");
2995 		err = -ENOMEM;
2996 		goto out;
2997 	}
2998 
2999 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3000 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3001 
3002 	err = bpf__setup_stdout(trace.evlist);
3003 	if (err) {
3004 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3005 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3006 		goto out;
3007 	}
3008 
3009 	err = -1;
3010 
3011 	if (trace.trace_pgfaults) {
3012 		trace.opts.sample_address = true;
3013 		trace.opts.sample_time = true;
3014 	}
3015 
3016 	if (trace.opts.mmap_pages == UINT_MAX)
3017 		mmap_pages_user_set = false;
3018 
3019 	if (trace.max_stack == UINT_MAX) {
3020 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3021 		max_stack_user_set = false;
3022 	}
3023 
3024 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3025 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
3026 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3027 #endif
3028 
3029 	if (callchain_param.enabled) {
3030 		if (!mmap_pages_user_set && geteuid() == 0)
3031 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3032 
3033 		symbol_conf.use_callchain = true;
3034 	}
3035 
3036 	if (trace.evlist->nr_entries > 0)
3037 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3038 
3039 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3040 		return trace__record(&trace, argc-1, &argv[1]);
3041 
3042 	/* summary_only implies summary option, but don't overwrite summary if set */
3043 	if (trace.summary_only)
3044 		trace.summary = trace.summary_only;
3045 
3046 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3047 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3048 		pr_err("Please specify something to trace.\n");
3049 		return -1;
3050 	}
3051 
3052 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3053 		pr_err("The -e option can't be used with --no-syscalls.\n");
3054 		goto out;
3055 	}
3056 
3057 	if (output_name != NULL) {
3058 		err = trace__open_output(&trace, output_name);
3059 		if (err < 0) {
3060 			perror("failed to create output file");
3061 			goto out;
3062 		}
3063 	}
3064 
3065 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3066 
3067 	err = target__validate(&trace.opts.target);
3068 	if (err) {
3069 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3070 		fprintf(trace.output, "%s", bf);
3071 		goto out_close;
3072 	}
3073 
3074 	err = target__parse_uid(&trace.opts.target);
3075 	if (err) {
3076 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3077 		fprintf(trace.output, "%s", bf);
3078 		goto out_close;
3079 	}
3080 
3081 	if (!argc && target__none(&trace.opts.target))
3082 		trace.opts.target.system_wide = true;
3083 
3084 	if (input_name)
3085 		err = trace__replay(&trace);
3086 	else
3087 		err = trace__run(&trace, argc, argv);
3088 
3089 out_close:
3090 	if (output_name != NULL)
3091 		fclose(trace.output);
3092 out:
3093 	return err;
3094 }
3095