xref: /linux/tools/perf/builtin-trace.c (revision 9be95bdc53c12ada23e39027237fd05e1393d893)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Global state for one 'perf trace' run: the evlist being monitored, the
 * syscall id/name tables used to beautify events, and the filters and
 * flags set up from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;
		struct syscall  *table;
		struct {	/* the tracepoints driving strace-like output */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* user-provided syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {	/* tool self-profiling counters, see show_tool_stats */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is a deny-list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* is the vfs_getname probe available? */
	int			trace_pgfaults;
	int			open_id;
};
122 
/*
 * Accessor for one tracepoint payload field: 'offset' locates the field in
 * the raw sample data and the union holds the decoder matching the field
 * kind — a fixed-width integer or a raw pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * field; memcpy is used because the raw payload may be unaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Same, but byte-swapping on the way out — used when the perf.data file
 * was recorded on a host of the opposite endianness (evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184 	return sample->raw_data + field->offset;
185 }
186 
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
/*
 * Decoders for the fields shared by the syscall tracepoints: the syscall
 * id plus either the argument block (sys_enter) or the return value
 * (sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Initialize the field named #name in the evsel's private syscall_tp,
 * looking it up by name in the tracepoint's format description.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* Like perf_evsel__init_tp_uint_field(), but for pointer-decoded fields. */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
/* Free the evsel's private syscall_tp, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
/* Decode the named syscall_tp field of a sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Ditto, but yielding a pointer into the raw payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
/* Print arg->val via the strarray in arg->parm, with 'intfmt' as fallback. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Default strarray beautifier: unknown values are printed in decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
308 
#define SCA_STRARRAY syscall_arg__scnprintf_strarray

/*
 * A set of strarrays covering disjoint value ranges, e.g. the regular and
 * the Linux-specific fcntl command tables.
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	-100
343 #endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
#define SCA_FDAT syscall_arg__scnprintf_fd_at

/* Forward declaration: the implementation needs thread_trace, defined below. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
/* Generic beautifiers for arguments with no richer interpretation. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377 
/*
 * String tables used via SCA_STRARRAY to print enum-like syscall arguments
 * by name. The *_OFFSET variants are for tables whose first valid value
 * isn't 0.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* values start at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl command tables, tried in order by the fcntl cmd beautifier */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460 						 struct syscall_arg *arg)
461 {
462 	size_t printed = 0;
463 	int mode = arg->val;
464 
465 	if (mode == F_OK) /* 0 */
466 		return scnprintf(bf, size, "F");
467 #define	P_MODE(n) \
468 	if (mode & n##_OK) { \
469 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
470 		mode &= ~n##_OK; \
471 	}
472 
473 	P_MODE(R);
474 	P_MODE(W);
475 	P_MODE(X);
476 #undef P_MODE
477 
478 	if (mode)
479 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
480 
481 	return printed;
482 }
483 
#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

/* Forward declaration: the implementation needs struct trace, see below. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
490 
491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492 						struct syscall_arg *arg)
493 {
494 	int printed = 0, flags = arg->val;
495 
496 #define	P_FLAG(n) \
497 	if (flags & O_##n) { \
498 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
499 		flags &= ~O_##n; \
500 	}
501 
502 	P_FLAG(CLOEXEC);
503 	P_FLAG(NONBLOCK);
504 #undef P_FLAG
505 
506 	if (flags)
507 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
508 
509 	return printed;
510 }
511 
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK	0x0001
516 #endif
517 #ifndef GRND_RANDOM
518 #define GRND_RANDOM	0x0002
519 #endif
520 
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 						   struct syscall_arg *arg)
523 {
524 	int printed = 0, flags = arg->val;
525 
526 #define	P_FLAG(n) \
527 	if (flags & GRND_##n) { \
528 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 		flags &= ~GRND_##n; \
530 	}
531 
532 	P_FLAG(RANDOM);
533 	P_FLAG(NONBLOCK);
534 #undef P_FLAG
535 
536 	if (flags)
537 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538 
539 	return printed;
540 }
541 
#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Initializer for a syscall_arg_fmt entry that prints via a strarray. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/*
 * How to print one syscall argument: 'scnprintf' formats it, 'parm' is
 * formatter-private data (e.g. a strarray), 'name' overrides the argument
 * name from the tracepoint format. show_zero presumably forces printing of
 * zero-valued args — confirm against the entry printer further down.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569 
/*
 * Per-syscall formatting overrides. NOTE: this table is looked up with
 * bsearch() in syscall_fmt__find(), so it MUST stay sorted by ->name.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;	/* alternate tracepoint name, e.g. arch variants */
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "arch_prctl", .alias = "prctl", },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
777 
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780 	const struct syscall_fmt *fmt = fmtp;
781 	return strcmp(name, fmt->name);
782 }
783 
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786 	const int nmemb = ARRAY_SIZE(syscall_fmts);
787 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789 
/*
 * Everything 'perf trace' knows about one syscall: its tracepoint format,
 * argument list and the formatting overrides from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* presumably exit/exit_group style, no return — confirm */
	struct syscall_fmt  *fmt;	/* NULL if no entry in syscall_fmts[] */
	struct syscall_arg_fmt *arg_fmt;
};
799 
800 /*
801  * We need to have this 'calculated' boolean because in some cases we really
802  * don't know what is the duration of a syscall, for instance, when we start
803  * a session and some threads are waiting for a syscall to finish, say 'poll',
804  * in which case all we can do is to print "( ? ) for duration and for the
805  * start timestamp.
806  */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809 	double duration = (double)t / NSEC_PER_MSEC;
810 	size_t printed = fprintf(fp, "(");
811 
812 	if (!calculated)
813 		printed += fprintf(fp, "     ?   ");
814 	else if (duration >= 1.0)
815 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 	else if (duration >= 0.01)
817 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818 	else
819 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 	return printed + fprintf(fp, "): ");
821 }
822 
/**
 * struct thread_trace - per-thread state, kept in thread->priv.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {	/* fd -> pathname cache, grown on demand */
		int	  max;			/* highest fd slot allocated; -1 when empty */
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
852 
853 static struct thread_trace *thread_trace__new(void)
854 {
855 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856 
857 	if (ttrace)
858 		ttrace->paths.max = -1;
859 
860 	ttrace->syscall_stats = intlist__new(NULL);
861 
862 	return ttrace;
863 }
864 
865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
866 {
867 	struct thread_trace *ttrace;
868 
869 	if (thread == NULL)
870 		goto fail;
871 
872 	if (thread__priv(thread) == NULL)
873 		thread__set_priv(thread, thread_trace__new());
874 
875 	if (thread__priv(thread) == NULL)
876 		goto fail;
877 
878 	ttrace = thread__priv(thread);
879 	++ttrace->nr_events;
880 
881 	return ttrace;
882 fail:
883 	color_fprintf(fp, PERF_COLOR_RED,
884 		      "WARNING: not enough memory, dropping samples!\n");
885 	return NULL;
886 }
887 
888 
/*
 * Let an argument beautifier install a custom formatter for this syscall
 * invocation's return value — e.g. fcntl may return fds, file flags, etc.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

/* Page-fault tracing flag bits (see trace->trace_pgfaults) */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where sys_enter lines are formatted */
static const size_t trace__entry_str_size = 2048;
901 
902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
903 {
904 	struct thread_trace *ttrace = thread__priv(thread);
905 
906 	if (fd > ttrace->paths.max) {
907 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
908 
909 		if (npath == NULL)
910 			return -1;
911 
912 		if (ttrace->paths.max != -1) {
913 			memset(npath + ttrace->paths.max + 1, 0,
914 			       (fd - ttrace->paths.max) * sizeof(char *));
915 		} else {
916 			memset(npath, 0, (fd + 1) * sizeof(char *));
917 		}
918 
919 		ttrace->paths.table = npath;
920 		ttrace->paths.max   = fd;
921 	}
922 
923 	ttrace->paths.table[fd] = strdup(pathname);
924 
925 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
926 }
927 
928 static int thread__read_fd_path(struct thread *thread, int fd)
929 {
930 	char linkname[PATH_MAX], pathname[PATH_MAX];
931 	struct stat st;
932 	int ret;
933 
934 	if (thread->pid_ == thread->tid) {
935 		scnprintf(linkname, sizeof(linkname),
936 			  "/proc/%d/fd/%d", thread->pid_, fd);
937 	} else {
938 		scnprintf(linkname, sizeof(linkname),
939 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
940 	}
941 
942 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
943 		return -1;
944 
945 	ret = readlink(linkname, pathname, sizeof(pathname));
946 
947 	if (ret < 0 || ret > st.st_size)
948 		return -1;
949 
950 	pathname[ret] = '\0';
951 	return trace__set_fd_pathname(thread, fd, pathname);
952 }
953 
954 static const char *thread__fd_path(struct thread *thread, int fd,
955 				   struct trace *trace)
956 {
957 	struct thread_trace *ttrace = thread__priv(thread);
958 
959 	if (ttrace == NULL)
960 		return NULL;
961 
962 	if (fd < 0)
963 		return NULL;
964 
965 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
966 		if (!trace->live)
967 			return NULL;
968 		++trace->stats.proc_getname;
969 		if (thread__read_fd_path(thread, fd))
970 			return NULL;
971 	}
972 
973 	return ttrace->paths.table[fd];
974 }
975 
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
977 {
978 	int fd = arg->val;
979 	size_t printed = scnprintf(bf, size, "%d", fd);
980 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981 
982 	if (path)
983 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984 
985 	return printed;
986 }
987 
988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
989 					      struct syscall_arg *arg)
990 {
991 	int fd = arg->val;
992 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
993 	struct thread_trace *ttrace = thread__priv(arg->thread);
994 
995 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
996 		zfree(&ttrace->paths.table[fd]);
997 
998 	return printed;
999 }
1000 
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002 				     unsigned long ptr)
1003 {
1004 	struct thread_trace *ttrace = thread__priv(thread);
1005 
1006 	ttrace->filename.ptr = ptr;
1007 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009 
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 					      struct syscall_arg *arg)
1012 {
1013 	unsigned long ptr = arg->val;
1014 
1015 	if (!arg->trace->vfs_getname)
1016 		return scnprintf(bf, size, "%#x", ptr);
1017 
1018 	thread__set_filename_pos(arg->thread, bf, ptr);
1019 	return 0;
1020 }
1021 
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026 
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030 
1031 	return fprintf(fp, "%10.3f ", ts);
1032 }
1033 
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042 	if (tstamp > 0)
1043 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1044 
1045 	return fprintf(fp, "         ? ");
1046 }
1047 
/*
 * Flags shared between the signal handler and the main loop.
 * 'volatile sig_atomic_t' is the only type the C standard guarantees
 * can be safely written from a signal handler and observed elsewhere
 * (C11 7.14.1.1); plain bool offers no such guarantee.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	/* SIGINT (^C) means "stop and show what we have", remember it */
	interrupted = sig == SIGINT;
}
1056 
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 	printed += fprintf_duration(duration, duration_calculated, fp);
1062 
1063 	if (trace->multiple_threads) {
1064 		if (trace->show_comm)
1065 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 		printed += fprintf(fp, "%d ", thread->tid);
1067 	}
1068 
1069 	return printed;
1070 }
1071 
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 				union perf_event *event, struct perf_sample *sample)
1074 {
1075 	int ret = 0;
1076 
1077 	switch (event->header.type) {
1078 	case PERF_RECORD_LOST:
1079 		color_fprintf(trace->output, PERF_COLOR_RED,
1080 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 		ret = machine__process_lost_event(machine, event, sample);
1082 		break;
1083 	default:
1084 		ret = machine__process_event(machine, event, sample);
1085 		break;
1086 	}
1087 
1088 	return ret;
1089 }
1090 
1091 static int trace__tool_process(struct perf_tool *tool,
1092 			       union perf_event *event,
1093 			       struct perf_sample *sample,
1094 			       struct machine *machine)
1095 {
1096 	struct trace *trace = container_of(tool, struct trace, tool);
1097 	return trace__process_event(trace, machine, event, sample);
1098 }
1099 
1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1101 {
1102 	struct machine *machine = vmachine;
1103 
1104 	if (machine->kptr_restrict_warned)
1105 		return NULL;
1106 
1107 	if (symbol_conf.kptr_restrict) {
1108 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110 			   "Kernel samples will not be resolved.\n");
1111 		machine->kptr_restrict_warned = true;
1112 		return NULL;
1113 	}
1114 
1115 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1116 }
1117 
1118 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1119 {
1120 	int err = symbol__init(NULL);
1121 
1122 	if (err)
1123 		return err;
1124 
1125 	trace->host = machine__new_host();
1126 	if (trace->host == NULL)
1127 		return -ENOMEM;
1128 
1129 	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1130 		return -errno;
1131 
1132 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1133 					    evlist->threads, trace__tool_process, false,
1134 					    trace->opts.proc_map_timeout, 1);
1135 	if (err)
1136 		symbol__exit();
1137 
1138 	return err;
1139 }
1140 
1141 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1142 {
1143 	int idx;
1144 
1145 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1146 		nr_args = sc->fmt->nr_args;
1147 
1148 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1149 	if (sc->arg_fmt == NULL)
1150 		return -1;
1151 
1152 	for (idx = 0; idx < nr_args; ++idx) {
1153 		if (sc->fmt)
1154 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1155 	}
1156 
1157 	sc->nr_args = nr_args;
1158 	return 0;
1159 }
1160 
1161 static int syscall__set_arg_fmts(struct syscall *sc)
1162 {
1163 	struct format_field *field;
1164 	int idx = 0, len;
1165 
1166 	for (field = sc->args; field; field = field->next, ++idx) {
1167 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1168 			continue;
1169 
1170 		if (strcmp(field->type, "const char *") == 0 &&
1171 			 (strcmp(field->name, "filename") == 0 ||
1172 			  strcmp(field->name, "path") == 0 ||
1173 			  strcmp(field->name, "pathname") == 0))
1174 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1175 		else if (field->flags & FIELD_IS_POINTER)
1176 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1177 		else if (strcmp(field->type, "pid_t") == 0)
1178 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1179 		else if (strcmp(field->type, "umode_t") == 0)
1180 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1181 		else if ((strcmp(field->type, "int") == 0 ||
1182 			  strcmp(field->type, "unsigned int") == 0 ||
1183 			  strcmp(field->type, "long") == 0) &&
1184 			 (len = strlen(field->name)) >= 2 &&
1185 			 strcmp(field->name + len - 2, "fd") == 0) {
1186 			/*
1187 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1188 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1189 			 * 65 int
1190 			 * 23 unsigned int
1191 			 * 7 unsigned long
1192 			 */
1193 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1194 		}
1195 	}
1196 
1197 	return 0;
1198 }
1199 
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name,
 * grow the table if needed, hook up the handcrafted fmt entry and the
 * syscalls:sys_enter_<name> tracepoint format, then derive the
 * per-argument formatters.
 *
 * Returns 0 on success, -1 when the id is unknown to the syscall
 * table, on memory exhaustion, or when the tracepoint format (and the
 * ->fmt->alias fallback, if any) can't be read.
 *
 * NOTE(review): on the IS_ERR(sc->tp_format) failure path sc->name has
 * already been set, and trace__syscall_info() uses a non-NULL name as
 * "entry is valid" — a failed entry might later look readable; confirm.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero the appended slots: name == NULL marks "not read yet" */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some fmt entries provide an alias tracepoint name to retry with */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a format file, assume the maximum of 6 arguments */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit()/exit_group() never return, so no sys_exit will follow */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1260 
/*
 * Turn the strlist of syscall names in trace->ev_qualifier into the
 * trace->ev_qualifier_ids array of syscall ids.  An entry may be an
 * exact name or a glob; a glob can match several syscalls, in which
 * case the ids array is grown on demand (initially one slot per list
 * entry).  All invalid names are accumulated into a single error line.
 *
 * Returns 0 on success, -EINVAL on invalid names (or the initial
 * allocation failing), -ENOMEM when growing the array fails; on error
 * the ids array is freed and its count zeroed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name: try it as a glob */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* First bad name opens the error message, later ones
			 * are appended comma-separated */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Collect the remaining ids matching the glob */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1336 
1337 /*
1338  * args is to be interpreted as a series of longs but we need to handle
1339  * 8-byte unaligned accesses. args points to raw_data within the event
1340  * and raw_data is guaranteed to be 8-byte unaligned because it is
1341  * preceded by raw_size which is a u32. So we need to copy args to a temp
1342  * variable to read it. Most notably this avoids extended load instructions
1343  * on unaligned addresses
1344  */
1345 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1346 {
1347 	unsigned long val;
1348 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1349 
1350 	memcpy(&val, p, sizeof(val));
1351 	return val;
1352 }
1353 
1354 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1355 				      struct syscall_arg *arg)
1356 {
1357 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1358 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1359 
1360 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1361 }
1362 
1363 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1364 				     struct syscall_arg *arg, unsigned long val)
1365 {
1366 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1367 		arg->val = val;
1368 		if (sc->arg_fmt[arg->idx].parm)
1369 			arg->parm = sc->arg_fmt[arg->idx].parm;
1370 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1371 	}
1372 	return scnprintf(bf, size, "%ld", val);
1373 }
1374 
/*
 * Format the syscall arguments into 'bf'.  With a parsed tracepoint
 * format (sc->args != NULL) each named field is printed through its
 * formatter; without one (tp_format is an error pointer) the first
 * sc->nr_args raw values are printed instead.  Formatters may consume
 * extra args by setting bits in arg.mask, which skips them here.
 *
 * NOTE(review): ttrace is dereferenced without a NULL check — the only
 * caller in this file, trace__sys_enter(), bails out earlier when
 * thread__trace() fails, so this relies on that contract.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* bit for arg.idx in the arg.mask "consumed" bitmap */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Already consumed by an earlier arg's formatter */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1447 
/*
 * Signature shared by the per-event sample handlers in this file
 * (trace__sys_enter, trace__sys_exit, trace__vfs_getname, ...).
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1451 
/*
 * Map a raw syscall id to its struct syscall entry, reading the
 * tracepoint info on first use.  Returns NULL for invalid (negative)
 * ids or when the syscall info can't be read, printing a diagnostic on
 * trace->output when verbose.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may return 0 yet leave the
	 * entry unusable (name == NULL marks "not read") */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1494 
1495 static void thread__update_stats(struct thread_trace *ttrace,
1496 				 int id, struct perf_sample *sample)
1497 {
1498 	struct int_node *inode;
1499 	struct stats *stats;
1500 	u64 duration = 0;
1501 
1502 	inode = intlist__findnew(ttrace->syscall_stats, id);
1503 	if (inode == NULL)
1504 		return;
1505 
1506 	stats = inode->priv;
1507 	if (stats == NULL) {
1508 		stats = malloc(sizeof(struct stats));
1509 		if (stats == NULL)
1510 			return;
1511 		init_stats(stats);
1512 		inode->priv = stats;
1513 	}
1514 
1515 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1516 		duration = sample->time - ttrace->entry_time;
1517 
1518 	update_stats(stats, duration);
1519 }
1520 
1521 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1522 {
1523 	struct thread_trace *ttrace;
1524 	u64 duration;
1525 	size_t printed;
1526 
1527 	if (trace->current == NULL)
1528 		return 0;
1529 
1530 	ttrace = thread__priv(trace->current);
1531 
1532 	if (!ttrace->entry_pending)
1533 		return 0;
1534 
1535 	duration = sample->time - ttrace->entry_time;
1536 
1537 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1538 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1539 	ttrace->entry_pending = false;
1540 
1541 	return printed;
1542 }
1543 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the
 * per-thread entry_str buffer.  For most syscalls the line is kept
 * pending and completed when the matching sys_exit arrives (or flushed
 * early if another thread's event interrupts it); exit()/exit_group()
 * never return, so their line is printed right away.
 *
 * Returns 0 on success, -1 on unknown syscall id or memory exhaustion.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Allocated lazily, then reused for every syscall of this thread */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* When not filtering, flush any pending line of the previous thread */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will follow: print the line now */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread printed for, for interrupted-entry handling */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1602 
1603 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1604 				    struct perf_sample *sample,
1605 				    struct callchain_cursor *cursor)
1606 {
1607 	struct addr_location al;
1608 
1609 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1610 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1611 		return -1;
1612 
1613 	return 0;
1614 }
1615 
1616 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1617 {
1618 	/* TODO: user-configurable print_opts */
1619 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1620 				        EVSEL__PRINT_DSO |
1621 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1622 
1623 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1624 }
1625 
/*
 * raw_syscalls:sys_exit handler: complete the line started at
 * sys_enter (or print a "... continued" marker when the enter wasn't
 * seen), applying the --duration filter, formatting the return value
 * according to the syscall's fmt entry and emitting the callchain when
 * requested.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* The fd returned by "open": bind it to the name captured by the
	 * probe:vfs_getname event, see trace__vfs_getname() */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip the line */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* sys_enter wasn't seen, e.g. tracing started mid-syscall */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed while printing the args,
		 * see syscall_arg__set_ret_scnprintf() */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* errpid: the return value is a pid; print its comm too */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1739 
/*
 * probe:vfs_getname handler: grab the pathname being resolved and
 * 1) save a copy for binding to the fd that "open" will return
 *    (see trace__sys_exit()), and
 * 2) if a sys_enter line is pending with a filename position recorded
 *    via thread__set_filename_pos(), splice the name into entry_str at
 *    that position, keeping the tail of the name if it doesn't fit.
 *
 * Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread copy of the name as needed */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No filename placeholder in the pending line: nothing to splice */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Too long: keep the tail of the name */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name in */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1800 
1801 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1802 				     union perf_event *event __maybe_unused,
1803 				     struct perf_sample *sample)
1804 {
1805         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1806 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1807 	struct thread *thread = machine__findnew_thread(trace->host,
1808 							sample->pid,
1809 							sample->tid);
1810 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1811 
1812 	if (ttrace == NULL)
1813 		goto out_dump;
1814 
1815 	ttrace->runtime_ms += runtime_ms;
1816 	trace->runtime_ms += runtime_ms;
1817 out_put:
1818 	thread__put(thread);
1819 	return 0;
1820 
1821 out_dump:
1822 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1823 	       evsel->name,
1824 	       perf_evsel__strval(evsel, sample, "comm"),
1825 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1826 	       runtime,
1827 	       perf_evsel__intval(evsel, sample, "vruntime"));
1828 	goto out_put;
1829 }
1830 
1831 static void bpf_output__printer(enum binary_printer_ops op,
1832 				unsigned int val, void *extra)
1833 {
1834 	FILE *output = extra;
1835 	unsigned char ch = (unsigned char)val;
1836 
1837 	switch (op) {
1838 	case BINARY_PRINT_CHAR_DATA:
1839 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1840 		break;
1841 	case BINARY_PRINT_DATA_BEGIN:
1842 	case BINARY_PRINT_LINE_BEGIN:
1843 	case BINARY_PRINT_ADDR:
1844 	case BINARY_PRINT_NUM_DATA:
1845 	case BINARY_PRINT_NUM_PAD:
1846 	case BINARY_PRINT_SEP:
1847 	case BINARY_PRINT_CHAR_PAD:
1848 	case BINARY_PRINT_LINE_END:
1849 	case BINARY_PRINT_DATA_END:
1850 	default:
1851 		break;
1852 	}
1853 }
1854 
/*
 * Dump a BPF output event's raw payload via bpf_output__printer(),
 * 8 bytes per line.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1861 
/*
 * Generic handler for the non-syscall events requested with --event:
 * prints timestamp and event name, then either the BPF output payload
 * or the tracepoint fields, plus the callchain when requested.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Filler presumably keeping columns aligned with syscall lines */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1902 
1903 static void print_location(FILE *f, struct perf_sample *sample,
1904 			   struct addr_location *al,
1905 			   bool print_dso, bool print_sym)
1906 {
1907 
1908 	if ((verbose > 0 || print_dso) && al->map)
1909 		fprintf(f, "%s@", al->map->dso->long_name);
1910 
1911 	if ((verbose > 0 || print_sym) && al->sym)
1912 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1913 			al->addr - al->sym->start);
1914 	else if (al->map)
1915 		fprintf(f, "0x%" PRIx64, al->addr);
1916 	else
1917 		fprintf(f, "0x%" PRIx64, sample->addr);
1918 }
1919 
/*
 * Page fault software event handler (major and minor): count the fault
 * in the per-thread stats and, unless --summary-only, print a line
 * with the faulting IP and the target address, resolved to symbols
 * where possible.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* becomes 'x' or '?' below when not in a data map */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Stack shallower than --min-stack: skip this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* The address being accessed: try data maps first, then code */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1995 
1996 static void trace__set_base_time(struct trace *trace,
1997 				 struct perf_evsel *evsel,
1998 				 struct perf_sample *sample)
1999 {
2000 	/*
2001 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2002 	 * and don't use sample->time unconditionally, we may end up having
2003 	 * some other event in the future without PERF_SAMPLE_TIME for good
2004 	 * reason, i.e. we may not be interested in its timestamps, just in
2005 	 * it taking place, picking some piece of information when it
2006 	 * appears in our event stream (vfs_getname comes to mind).
2007 	 */
2008 	if (trace->base_time == 0 && !trace->full_time &&
2009 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2010 		trace->base_time = sample->time;
2011 }
2012 
2013 static int trace__process_sample(struct perf_tool *tool,
2014 				 union perf_event *event,
2015 				 struct perf_sample *sample,
2016 				 struct perf_evsel *evsel,
2017 				 struct machine *machine __maybe_unused)
2018 {
2019 	struct trace *trace = container_of(tool, struct trace, tool);
2020 	struct thread *thread;
2021 	int err = 0;
2022 
2023 	tracepoint_handler handler = evsel->handler;
2024 
2025 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2026 	if (thread && thread__is_filtered(thread))
2027 		goto out;
2028 
2029 	trace__set_base_time(trace, evsel, sample);
2030 
2031 	if (handler) {
2032 		++trace->nr_events;
2033 		handler(trace, evsel, event, sample);
2034 	}
2035 out:
2036 	thread__put(thread);
2037 	return err;
2038 }
2039 
2040 static int trace__record(struct trace *trace, int argc, const char **argv)
2041 {
2042 	unsigned int rec_argc, i, j;
2043 	const char **rec_argv;
2044 	const char * const record_args[] = {
2045 		"record",
2046 		"-R",
2047 		"-m", "1024",
2048 		"-c", "1",
2049 	};
2050 
2051 	const char * const sc_args[] = { "-e", };
2052 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2053 	const char * const majpf_args[] = { "-e", "major-faults" };
2054 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2055 	const char * const minpf_args[] = { "-e", "minor-faults" };
2056 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2057 
2058 	/* +1 is for the event string below */
2059 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2060 		majpf_args_nr + minpf_args_nr + argc;
2061 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2062 
2063 	if (rec_argv == NULL)
2064 		return -ENOMEM;
2065 
2066 	j = 0;
2067 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2068 		rec_argv[j++] = record_args[i];
2069 
2070 	if (trace->trace_syscalls) {
2071 		for (i = 0; i < sc_args_nr; i++)
2072 			rec_argv[j++] = sc_args[i];
2073 
2074 		/* event string may be different for older kernels - e.g., RHEL6 */
2075 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2076 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2077 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2078 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2079 		else {
2080 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2081 			free(rec_argv);
2082 			return -1;
2083 		}
2084 	}
2085 
2086 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2087 		for (i = 0; i < majpf_args_nr; i++)
2088 			rec_argv[j++] = majpf_args[i];
2089 
2090 	if (trace->trace_pgfaults & TRACE_PFMIN)
2091 		for (i = 0; i < minpf_args_nr; i++)
2092 			rec_argv[j++] = minpf_args[i];
2093 
2094 	for (i = 0; i < (unsigned int)argc; i++)
2095 		rec_argv[j++] = argv[i];
2096 
2097 	return cmd_record(j, rec_argv);
2098 }
2099 
2100 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2101 
2102 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2103 {
2104 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2105 
2106 	if (IS_ERR(evsel))
2107 		return false;
2108 
2109 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2110 		perf_evsel__delete(evsel);
2111 		return false;
2112 	}
2113 
2114 	evsel->handler = trace__vfs_getname;
2115 	perf_evlist__add(evlist, evsel);
2116 	return true;
2117 }
2118 
2119 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2120 {
2121 	struct perf_evsel *evsel;
2122 	struct perf_event_attr attr = {
2123 		.type = PERF_TYPE_SOFTWARE,
2124 		.mmap_data = 1,
2125 	};
2126 
2127 	attr.config = config;
2128 	attr.sample_period = 1;
2129 
2130 	event_attr_init(&attr);
2131 
2132 	evsel = perf_evsel__new(&attr);
2133 	if (evsel)
2134 		evsel->handler = trace__pgfault;
2135 
2136 	return evsel;
2137 }
2138 
2139 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2140 {
2141 	const u32 type = event->header.type;
2142 	struct perf_evsel *evsel;
2143 
2144 	if (type != PERF_RECORD_SAMPLE) {
2145 		trace__process_event(trace, trace->host, event, sample);
2146 		return;
2147 	}
2148 
2149 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2150 	if (evsel == NULL) {
2151 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2152 		return;
2153 	}
2154 
2155 	trace__set_base_time(trace, evsel, sample);
2156 
2157 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2158 	    sample->raw_data == NULL) {
2159 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2160 		       perf_evsel__name(evsel), sample->tid,
2161 		       sample->cpu, sample->raw_size);
2162 	} else {
2163 		tracepoint_handler handler = evsel->handler;
2164 		handler(trace, evsel, event, sample);
2165 	}
2166 }
2167 
/*
 * Create the raw_syscalls:sys_{enter,exit} tracepoint events, cache the
 * field offsets used by the handlers (args ptr, return value), add both to
 * the evlist and stash them in trace->syscalls.events.
 *
 * Returns 0 on success, -1 on failure (partially created evsels are freed).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwind: delete whatever was created, newest first. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2213 
2214 static int trace__set_ev_qualifier_filter(struct trace *trace)
2215 {
2216 	int err = -1;
2217 	struct perf_evsel *sys_exit;
2218 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2219 						trace->ev_qualifier_ids.nr,
2220 						trace->ev_qualifier_ids.entries);
2221 
2222 	if (filter == NULL)
2223 		goto out_enomem;
2224 
2225 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2226 					  filter)) {
2227 		sys_exit = trace->syscalls.events.sys_exit;
2228 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2229 	}
2230 
2231 	free(filter);
2232 out:
2233 	return err;
2234 out_enomem:
2235 	errno = ENOMEM;
2236 	goto out;
2237 }
2238 
/*
 * Filter out our own pid and, when found, an "sshd" ancestor of ours --
 * presumably so 'perf trace' doesn't end up tracing the terminal traffic
 * generated by displaying its own output (NOTE(review): confirm intent).
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain (bounded by the pids[] array) looking for sshd. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2262 
/*
 * Live mode main loop: set up the syscall/pagefault/sched events, fork or
 * attach to the target, then consume the ring buffers until the workload
 * exits or the user interrupts, finally printing the optional summary.
 *
 * Returns 0 on success or a negative value, printing a human readable
 * diagnostic to trace->output via the out_error_* labels at the bottom.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* leftover args == workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
	/* Sweep all ring buffers; loop back while new events keep arriving. */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload finished: stop producing, drain what's left. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
	/*
	 * Not reached by fallthrough: the naked block below only exists to
	 * scope an errbuf shared by the out_error_* labels, which are entered
	 * via goto from the setup code above and funnel into out_delete_evlist.
	 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2526 
/*
 * Replay mode (-i): process a previously recorded perf.data file, wiring up
 * the same sys_enter/sys_exit/pgfault handlers used in live mode via the
 * perf_session machinery.  Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Route the side-band events through the stock processors. */
	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Hook up page fault samples recorded with -F/--pf. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2625 
/*
 * Emit the banner that precedes the per-thread summary table.
 * Returns the number of characters written.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2634 
/*
 * Resort helper for the per-thread syscall stats intlist: entries are
 * ordered by 'msecs' (descending, per the a->msecs > b->msecs comparison)
 * and snapshot the fields needed when printing the summary table.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr_calls * average; stats may be NULL for empty slots */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2648 
/*
 * Print the per-syscall statistics table (calls, total/min/avg/max msecs,
 * stddev %) for one thread, sorted by total time spent.
 * Returns the number of characters written, 0 if sorting failed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Stats are kept in nanoseconds; convert to msec for display. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative stddev, guarding against a zero average. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2691 
2692 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2693 {
2694 	size_t printed = 0;
2695 	struct thread_trace *ttrace = thread__priv(thread);
2696 	double ratio;
2697 
2698 	if (ttrace == NULL)
2699 		return 0;
2700 
2701 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2702 
2703 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2704 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2705 	printed += fprintf(fp, "%.1f%%", ratio);
2706 	if (ttrace->pfmaj)
2707 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2708 	if (ttrace->pfmin)
2709 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2710 	if (trace->sched)
2711 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2712 	else if (fputc('\n', fp) != EOF)
2713 		++printed;
2714 
2715 	printed += thread__dump_stats(ttrace, trace, fp);
2716 
2717 	return printed;
2718 }
2719 
2720 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2721 {
2722 	return ttrace ? ttrace->nr_events : 0;
2723 }
2724 
/*
 * Resort helper for the machine's thread rbtree: keyed on per-thread event
 * count (via thread__nr_events on thread->priv), used by the summary below.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2731 
/*
 * Print the end-of-run summary: a header followed by one entry per traced
 * thread, walking every bucket of the machine's thread table sorted by
 * event count.  Returns characters written, 0 if sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2753 
2754 static int trace__set_duration(const struct option *opt, const char *str,
2755 			       int unset __maybe_unused)
2756 {
2757 	struct trace *trace = opt->value;
2758 
2759 	trace->duration_filter = atof(str);
2760 	return 0;
2761 }
2762 
2763 static int trace__set_filter_pids(const struct option *opt, const char *str,
2764 				  int unset __maybe_unused)
2765 {
2766 	int ret = -1;
2767 	size_t i;
2768 	struct trace *trace = opt->value;
2769 	/*
2770 	 * FIXME: introduce a intarray class, plain parse csv and create a
2771 	 * { int nr, int entries[] } struct...
2772 	 */
2773 	struct intlist *list = intlist__new(str);
2774 
2775 	if (list == NULL)
2776 		return -1;
2777 
2778 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2779 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2780 
2781 	if (trace->filter_pids.entries == NULL)
2782 		goto out;
2783 
2784 	trace->filter_pids.entries[0] = getpid();
2785 
2786 	for (i = 1; i < trace->filter_pids.nr; ++i)
2787 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2788 
2789 	intlist__delete(list);
2790 	ret = 0;
2791 out:
2792 	return ret;
2793 }
2794 
2795 static int trace__open_output(struct trace *trace, const char *filename)
2796 {
2797 	struct stat st;
2798 
2799 	if (!stat(filename, &st) && st.st_size) {
2800 		char oldname[PATH_MAX];
2801 
2802 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2803 		unlink(oldname);
2804 		rename(filename, oldname);
2805 	}
2806 
2807 	trace->output = fopen(filename, "w");
2808 
2809 	return trace->output == NULL ? -errno : 0;
2810 }
2811 
2812 static int parse_pagefaults(const struct option *opt, const char *str,
2813 			    int unset __maybe_unused)
2814 {
2815 	int *trace_pgfaults = opt->value;
2816 
2817 	if (strcmp(str, "all") == 0)
2818 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2819 	else if (strcmp(str, "maj") == 0)
2820 		*trace_pgfaults |= TRACE_PFMAJ;
2821 	else if (strcmp(str, "min") == 0)
2822 		*trace_pgfaults |= TRACE_PFMIN;
2823 	else
2824 		return -1;
2825 
2826 	return 0;
2827 }
2828 
/* Set the same sample handler on every event in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2836 
2837 /*
2838  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2839  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2840  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2841  *
2842  * It'd be better to introduce a parse_options() variant that would return a
2843  * list with the terms it didn't match to an event...
2844  */
/*
 * -e/--event callback: split the comma separated list into two buckets --
 * lists[1] for names that are syscalls (or strace group files under
 * STRACE_GROUPS_DIR), lists[0] for everything else -- then feed lists[1]
 * to the syscall qualifier and lists[0] to the stock parse_events_option().
 *
 * A leading '!' negates the syscall qualifier.  The input string is
 * temporarily mutated (commas swapped for NULs) and restored on exit.
 * Returns 0 on success, -1 on failure.
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* Cut off the current term at the next comma, if any. */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		/* A syscall name/glob, or a strace group file, goes to lists[1]. */
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		/* Append to the chosen bucket; len bounds the total we can copy. */
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the comma and move past it to the next term. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		/* Hand the non-syscall terms to the regular -e parser. */
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* Undo the last in-place comma removal if we bailed out mid-string. */
	if (sep)
		*sep = ',';

	return err;
}
2922 
/*
 * Entry point for the 'perf trace' builtin.
 *
 * Parses options, sets up the event list and syscall table, then either:
 *   - delegates to trace__record() for the "record" subcommand,
 *   - replays a previously recorded perf.data file (-i/--input), or
 *   - runs/attaches to a live workload via trace__run().
 *
 * Returns 0 on success, a negative value (usually -1 or -ENOMEM) on error.
 */
2923 int cmd_trace(int argc, const char **argv)
2924 {
	/* Usage strings shown by parse_options_subcommand() on error/-h. */
2925 	const char *trace_usage[] = {
2926 		"perf trace [<options>] [<command>]",
2927 		"perf trace [<options>] -- <command> [<options>]",
2928 		"perf trace record [<options>] [<command>]",
2929 		"perf trace record [<options>] -- <command> [<options>]",
2930 		NULL
2931 	};
	/*
	 * Tool state with defaults.  Several fields use UINT_MAX/ULLONG_MAX
	 * as "not set by the user" sentinels that are resolved further down
	 * (mmap_pages, max_stack, uid, user_freq, user_interval).
	 */
2932 	struct trace trace = {
2933 		.syscalls = {
2934 			. max = -1,
2935 		},
2936 		.opts = {
2937 			.target = {
2938 				.uid	   = UINT_MAX,
2939 				.uses_mmap = true,
2940 			},
2941 			.user_freq     = UINT_MAX,
2942 			.user_interval = ULLONG_MAX,
2943 			.no_buffering  = true,
2944 			.mmap_pages    = UINT_MAX,
2945 			.proc_map_timeout  = 500,
2946 		},
		/* Default to stderr; replaced below if -o/--output is given. */
2947 		.output = stderr,
2948 		.show_comm = true,
2949 		.trace_syscalls = true,
2950 		.kernel_syscallchains = false,
2951 		.max_stack = UINT_MAX,
2952 	};
2953 	const char *output_name = NULL;
	/* Command-line option table; callbacks fill in 'trace' above. */
2954 	const struct option trace_options[] = {
2955 	OPT_CALLBACK('e', "event", &trace, "event",
2956 		     "event/syscall selector. use 'perf list' to list available events",
2957 		     trace__parse_events_option),
2958 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2959 		    "show the thread COMM next to its id"),
2960 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	/* --expr is an alias for -e: same callback, same target. */
2961 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
2962 		     trace__parse_events_option),
2963 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2964 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2965 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2966 		    "trace events on existing process id"),
2967 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2968 		    "trace events on existing thread id"),
2969 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
2970 		     "pids to filter (by the kernel)", trace__set_filter_pids),
2971 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2972 		    "system-wide collection from all CPUs"),
2973 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2974 		    "list of cpus to monitor"),
2975 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2976 		    "child tasks do not inherit counters"),
2977 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2978 		     "number of mmap data pages",
2979 		     perf_evlist__parse_mmap_pages),
2980 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2981 		   "user to profile"),
2982 	OPT_CALLBACK(0, "duration", &trace, "float",
2983 		     "show only events with duration > N.M ms",
2984 		     trace__set_duration),
2985 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2986 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2987 	OPT_BOOLEAN('T', "time", &trace.full_time,
2988 		    "Show full timestamp, not time relative to first start"),
2989 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2990 		    "Show only syscall summary with statistics"),
2991 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2992 		    "Show all syscalls and summary with statistics"),
2993 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2994 		     "Trace pagefaults", parse_pagefaults, "maj"),
2995 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2996 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
2997 	OPT_CALLBACK(0, "call-graph", &trace.opts,
2998 		     "record_mode[,record_size]", record_callchain_help,
2999 		     &record_parse_callchain_opt),
3000 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3001 		    "Show the kernel callchains on the syscall exit path"),
3002 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3003 		     "Set the minimum stack depth when parsing the callchain, "
3004 		     "anything below the specified depth will be ignored."),
3005 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3006 		     "Set the maximum stack depth when parsing the callchain, "
3007 		     "anything beyond the specified depth will be ignored. "
3008 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3009 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3010 			"per thread proc mmap processing timeout in ms"),
3011 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3012 		     "ms to wait before starting measurement after program "
3013 		     "start"),
3014 	OPT_END()
3015 	};
	/*
	 * Assume the user set these; flipped to false below when the UINT_MAX
	 * sentinels show the options were left at their defaults.
	 */
3016 	bool __maybe_unused max_stack_user_set = true;
3017 	bool mmap_pages_user_set = true;
3018 	const char * const trace_subcommands[] = { "record", NULL };
3019 	int err;
3020 	char bf[BUFSIZ];
3021 
	/* Dump a stack trace if the tool itself crashes. */
3022 	signal(SIGSEGV, sighandler_dump_stack);
3023 	signal(SIGFPE, sighandler_dump_stack);
3024 
3025 	trace.evlist = perf_evlist__new();
3026 	trace.sctbl = syscalltbl__new();
3027 
3028 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3029 		pr_err("Not enough memory to run!\n");
3030 		err = -ENOMEM;
3031 		goto out;
3032 	}
3033 
	/* Stops at the first non-option so the workload's argv is untouched. */
3034 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3035 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3036 
	/* Wire up the BPF stdout channel used by --event with BPF sources. */
3037 	err = bpf__setup_stdout(trace.evlist);
3038 	if (err) {
3039 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3040 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3041 		goto out;
3042 	}
3043 
	/* Default error code for the failure paths below. */
3044 	err = -1;
3045 
	/* Page fault records need address + timestamp samples to be useful. */
3046 	if (trace.trace_pgfaults) {
3047 		trace.opts.sample_address = true;
3048 		trace.opts.sample_time = true;
3049 	}
3050 
3051 	if (trace.opts.mmap_pages == UINT_MAX)
3052 		mmap_pages_user_set = false;
3053 
	/*
	 * No --max-stack given: when replaying a file use the compiled-in
	 * maximum, otherwise honor the running kernel's sysctl limit.
	 */
3053 	if (trace.max_stack == UINT_MAX) {
3055 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3056 		max_stack_user_set = false;
3057 	}
3058 
3059 #ifdef HAVE_DWARF_UNWIND_SUPPORT
	/*
	 * Stack-depth options imply callchains: default to DWARF unwinding
	 * when callchains weren't explicitly configured via --call-graph.
	 */
3060 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
3061 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3062 #endif
3063 
3064 	if (callchain_param.enabled) {
		/*
		 * Callchains need bigger buffers; as root, bump the ring
		 * buffer to 4x the mlock-able size (presumably to reduce
		 * lost events — rationale not visible here, confirm).
		 */
3065 		if (!mmap_pages_user_set && geteuid() == 0)
3066 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3067 
3068 		symbol_conf.use_callchain = true;
3069 	}
3070 
	/* Any --event entries get the generic tracepoint/event handler. */
3071 	if (trace.evlist->nr_entries > 0)
3072 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3073 
	/* "perf trace record ..." delegates to perf record with our events. */
3074 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3075 		return trace__record(&trace, argc-1, &argv[1]);
3076 
3077 	/* summary_only implies summary option, but don't overwrite summary if set */
3078 	if (trace.summary_only)
3079 		trace.summary = trace.summary_only;
3080 
	/* Nothing selected at all: neither syscalls, pagefaults nor events. */
3081 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3082 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3083 		pr_err("Please specify something to trace.\n");
3084 		return -1;
3085 	}
3086 
	/* A syscall qualifier (-e list) is meaningless with --no-syscalls. */
3087 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3088 		pr_err("The -e option can't be used with --no-syscalls.\n");
3089 		goto out;
3090 	}
3091 
	/* Redirect output from stderr to the -o file, if requested. */
3092 	if (output_name != NULL) {
3093 		err = trace__open_output(&trace, output_name);
3094 		if (err < 0) {
3095 			perror("failed to create output file");
3096 			goto out;
3097 		}
3098 	}
3099 
	/* Cache open()'s syscall id; used by the pathname beautifiers. */
3100 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3101 
3102 	err = target__validate(&trace.opts.target);
3103 	if (err) {
3104 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3105 		fprintf(trace.output, "%s", bf);
3106 		goto out_close;
3107 	}
3108 
3109 	err = target__parse_uid(&trace.opts.target);
3110 	if (err) {
3111 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3112 		fprintf(trace.output, "%s", bf);
3113 		goto out_close;
3114 	}
3115 
	/* No workload and no -p/-t/-C/-u target: trace the whole system. */
3116 	if (!argc && target__none(&trace.opts.target))
3117 		trace.opts.target.system_wide = true;
3118 
3119 	if (input_name)
3120 		err = trace__replay(&trace);
3121 	else
3122 		err = trace__run(&trace, argc, argv);
3123 
	/* goto-style cleanup ladder: only close output if we opened a file. */
3124 out_close:
3125 	if (output_name != NULL)
3126 		fclose(trace.output);
3127 out:
3128 	return err;
3129 }
3130
3130