xref: /linux/tools/perf/builtin-trace.c (revision 0a6545bda2756807a089c64352edfc5628c57e6c)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47 
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61 
62 #include "sane_ctype.h"
63 
64 #ifndef O_CLOEXEC
65 # define O_CLOEXEC		02000000
66 #endif
67 
68 #ifndef F_LINUX_SPECIFIC_BASE
69 # define F_LINUX_SPECIFIC_BASE	1024
70 #endif
71 
72 struct trace {
73 	struct perf_tool	tool;
74 	struct syscalltbl	*sctbl;
75 	struct {
76 		int		max;
77 		struct syscall  *table;
78 		struct {
79 			struct perf_evsel *sys_enter,
80 					  *sys_exit;
81 		}		events;
82 	} syscalls;
83 	struct record_opts	opts;
84 	struct perf_evlist	*evlist;
85 	struct machine		*host;
86 	struct thread		*current;
87 	struct cgroup		*cgroup;
88 	u64			base_time;
89 	FILE			*output;
90 	unsigned long		nr_events;
91 	struct strlist		*ev_qualifier;
92 	struct {
93 		size_t		nr;
94 		int		*entries;
95 	}			ev_qualifier_ids;
96 	struct {
97 		size_t		nr;
98 		pid_t		*entries;
99 	}			filter_pids;
100 	double			duration_filter;
101 	double			runtime_ms;
102 	struct {
103 		u64		vfs_getname,
104 				proc_getname;
105 	} stats;
106 	unsigned int		max_stack;
107 	unsigned int		min_stack;
108 	bool			not_ev_qualifier;
109 	bool			live;
110 	bool			full_time;
111 	bool			sched;
112 	bool			multiple_threads;
113 	bool			summary;
114 	bool			summary_only;
115 	bool			failure_only;
116 	bool			show_comm;
117 	bool			print_sample;
118 	bool			show_tool_stats;
119 	bool			trace_syscalls;
120 	bool			kernel_syscallchains;
121 	bool			force;
122 	bool			vfs_getname;
123 	int			trace_pgfaults;
124 	int			open_id;
125 };
126 
127 struct tp_field {
128 	int offset;
129 	union {
130 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
131 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
132 	};
133 };
134 
135 #define TP_UINT_FIELD(bits) \
136 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
137 { \
138 	u##bits value; \
139 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
140 	return value;  \
141 }
142 
143 TP_UINT_FIELD(8);
144 TP_UINT_FIELD(16);
145 TP_UINT_FIELD(32);
146 TP_UINT_FIELD(64);
147 
148 #define TP_UINT_FIELD__SWAPPED(bits) \
149 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
150 { \
151 	u##bits value; \
152 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
153 	return bswap_##bits(value);\
154 }
155 
156 TP_UINT_FIELD__SWAPPED(16);
157 TP_UINT_FIELD__SWAPPED(32);
158 TP_UINT_FIELD__SWAPPED(64);
159 
160 static int tp_field__init_uint(struct tp_field *field,
161 			       struct format_field *format_field,
162 			       bool needs_swap)
163 {
164 	field->offset = format_field->offset;
165 
166 	switch (format_field->size) {
167 	case 1:
168 		field->integer = tp_field__u8;
169 		break;
170 	case 2:
171 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
172 		break;
173 	case 4:
174 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
175 		break;
176 	case 8:
177 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
178 		break;
179 	default:
180 		return -1;
181 	}
182 
183 	return 0;
184 }
185 
186 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
187 {
188 	return sample->raw_data + field->offset;
189 }
190 
191 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
192 {
193 	field->offset = format_field->offset;
194 	field->pointer = tp_field__ptr;
195 	return 0;
196 }
197 
198 struct syscall_tp {
199 	struct tp_field id;
200 	union {
201 		struct tp_field args, ret;
202 	};
203 };
204 
205 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
206 					  struct tp_field *field,
207 					  const char *name)
208 {
209 	struct format_field *format_field = perf_evsel__field(evsel, name);
210 
211 	if (format_field == NULL)
212 		return -1;
213 
214 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
215 }
216 
217 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
218 	({ struct syscall_tp *sc = evsel->priv;\
219 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
220 
221 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
222 					 struct tp_field *field,
223 					 const char *name)
224 {
225 	struct format_field *format_field = perf_evsel__field(evsel, name);
226 
227 	if (format_field == NULL)
228 		return -1;
229 
230 	return tp_field__init_ptr(field, format_field);
231 }
232 
233 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
234 	({ struct syscall_tp *sc = evsel->priv;\
235 	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
236 
237 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
238 {
239 	zfree(&evsel->priv);
240 	perf_evsel__delete(evsel);
241 }
242 
243 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
244 {
245 	evsel->priv = malloc(sizeof(struct syscall_tp));
246 	if (evsel->priv != NULL) {
247 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
248 			goto out_delete;
249 
250 		evsel->handler = handler;
251 		return 0;
252 	}
253 
254 	return -ENOMEM;
255 
256 out_delete:
257 	zfree(&evsel->priv);
258 	return -ENOENT;
259 }
260 
/*
 * Create the evsel for the syscall tracepoint named @direction and attach
 * @handler to it.
 *
 * "raw_syscalls" is tried first, falling back to "syscalls", which is
 * what older kernels (e.g., RHEL6) provide.
 *
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * its private area could not be initialized.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
281 
/*
 * Read member 'name' of the struct syscall_tp in evsel->priv from 'sample'
 * through its integer reader.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above, but through the pointer reader of member 'name'.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
289 
290 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
291 {
292 	int idx = val - sa->offset;
293 
294 	if (idx < 0 || idx >= sa->nr_entries)
295 		return scnprintf(bf, size, intfmt, val);
296 
297 	return scnprintf(bf, size, "%s", sa->entries[idx]);
298 }
299 
300 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301 						const char *intfmt,
302 					        struct syscall_arg *arg)
303 {
304 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
305 }
306 
307 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
308 					      struct syscall_arg *arg)
309 {
310 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
311 }
312 
313 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
314 
/* A list of string tables that are tried one after the other. */
struct strarrays {
	int		nr_entries;	/* number of tables in 'entries' */
	struct strarray **entries;
};

/* Defines strarrays__<array> wrapping the 'struct strarray *' array <array>. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
324 
325 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
326 					struct syscall_arg *arg)
327 {
328 	struct strarrays *sas = arg->parm;
329 	int i;
330 
331 	for (i = 0; i < sas->nr_entries; ++i) {
332 		struct strarray *sa = sas->entries[i];
333 		int idx = arg->val - sa->offset;
334 
335 		if (idx >= 0 && idx < sa->nr_entries) {
336 			if (sa->entries[idx] == NULL)
337 				break;
338 			return scnprintf(bf, size, "%s", sa->entries[idx]);
339 		}
340 	}
341 
342 	return scnprintf(bf, size, "%d", arg->val);
343 }
344 
345 #ifndef AT_FDCWD
346 #define AT_FDCWD	-100
347 #endif
348 
349 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
350 					   struct syscall_arg *arg)
351 {
352 	int fd = arg->val;
353 
354 	if (fd == AT_FDCWD)
355 		return scnprintf(bf, size, "CWD");
356 
357 	return syscall_arg__scnprintf_fd(bf, size, arg);
358 }
359 
360 #define SCA_FDAT syscall_arg__scnprintf_fd_at
361 
362 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
363 					      struct syscall_arg *arg);
364 
365 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
366 
367 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
368 {
369 	return scnprintf(bf, size, "%#lx", arg->val);
370 }
371 
372 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
373 {
374 	return scnprintf(bf, size, "%d", arg->val);
375 }
376 
377 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
378 {
379 	return scnprintf(bf, size, "%ld", arg->val);
380 }
381 
382 static const char *bpf_cmd[] = {
383 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
384 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
385 };
386 static DEFINE_STRARRAY(bpf_cmd);
387 
388 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
389 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
390 
391 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
392 static DEFINE_STRARRAY(itimers);
393 
394 static const char *keyctl_options[] = {
395 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
396 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
397 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
398 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
399 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
400 };
401 static DEFINE_STRARRAY(keyctl_options);
402 
403 static const char *whences[] = { "SET", "CUR", "END",
404 #ifdef SEEK_DATA
405 "DATA",
406 #endif
407 #ifdef SEEK_HOLE
408 "HOLE",
409 #endif
410 };
411 static DEFINE_STRARRAY(whences);
412 
413 static const char *fcntl_cmds[] = {
414 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
415 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
416 	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
417 	"GETOWNER_UIDS",
418 };
419 static DEFINE_STRARRAY(fcntl_cmds);
420 
421 static const char *fcntl_linux_specific_cmds[] = {
422 	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
423 	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
424 	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
425 };
426 
427 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
428 
429 static struct strarray *fcntl_cmds_arrays[] = {
430 	&strarray__fcntl_cmds,
431 	&strarray__fcntl_linux_specific_cmds,
432 };
433 
434 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
435 
/* {get,set}rlimit()/prlimit64() 'resource' argument */
static const char *rlimit_resources[] = {
	"CPU",
	"FSIZE",
	"DATA",
	"STACK",
	"CORE",
	"RSS",
	"NPROC",
	"NOFILE",
	"MEMLOCK",
	"AS",
	"LOCKS",
	"SIGPENDING",
	"MSGQUEUE",
	"NICE",
	"RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask() 'how' argument */
static const char *sighow[] = {
	"BLOCK",
	"UNBLOCK",
	"SETMASK",
};
static DEFINE_STRARRAY(sighow);

/* clock_gettime() 'clk_id' argument */
static const char *clockid[] = {
	"REALTIME",
	"MONOTONIC",
	"PROCESS_CPUTIME_ID",
	"THREAD_CPUTIME_ID",
	"MONOTONIC_RAW",
	"REALTIME_COARSE",
	"MONOTONIC_COARSE",
	"BOOTTIME",
	"REALTIME_ALARM",
	"BOOTTIME_ALARM",
	"SGI_CYCLE",
	"TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket()/socketpair() 'family' argument */
static const char *socket_families[] = {
	"UNSPEC",
	"LOCAL",
	"INET",
	"AX25",
	"IPX",
	"APPLETALK",
	"NETROM",
	"BRIDGE",
	"ATMPVC",
	"X25",
	"INET6",
	"ROSE",
	"DECnet",
	"NETBEUI",
	"SECURITY",
	"KEY",
	"NETLINK",
	"PACKET",
	"ASH",
	"ECONET",
	"ATMSVC",
	"RDS",
	"SNA",
	"IRDA",
	"PPPOX",
	"WANPIPE",
	"LLC",
	"IB",
	"CAN",
	"TIPC",
	"BLUETOOTH",
	"IUCV",
	"RXRPC",
	"ISDN",
	"PHONET",
	"IEEE802154",
	"CAIF",
	"ALG",
	"NFC",
	"VSOCK",
};
static DEFINE_STRARRAY(socket_families);
462 
463 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
464 						 struct syscall_arg *arg)
465 {
466 	size_t printed = 0;
467 	int mode = arg->val;
468 
469 	if (mode == F_OK) /* 0 */
470 		return scnprintf(bf, size, "F");
471 #define	P_MODE(n) \
472 	if (mode & n##_OK) { \
473 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
474 		mode &= ~n##_OK; \
475 	}
476 
477 	P_MODE(R);
478 	P_MODE(W);
479 	P_MODE(X);
480 #undef P_MODE
481 
482 	if (mode)
483 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
484 
485 	return printed;
486 }
487 
488 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
489 
490 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
491 					      struct syscall_arg *arg);
492 
493 #define SCA_FILENAME syscall_arg__scnprintf_filename
494 
495 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
496 						struct syscall_arg *arg)
497 {
498 	int printed = 0, flags = arg->val;
499 
500 #define	P_FLAG(n) \
501 	if (flags & O_##n) { \
502 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
503 		flags &= ~O_##n; \
504 	}
505 
506 	P_FLAG(CLOEXEC);
507 	P_FLAG(NONBLOCK);
508 #undef P_FLAG
509 
510 	if (flags)
511 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
512 
513 	return printed;
514 }
515 
516 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
517 
518 #ifndef GRND_NONBLOCK
519 #define GRND_NONBLOCK	0x0001
520 #endif
521 #ifndef GRND_RANDOM
522 #define GRND_RANDOM	0x0002
523 #endif
524 
525 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
526 						   struct syscall_arg *arg)
527 {
528 	int printed = 0, flags = arg->val;
529 
530 #define	P_FLAG(n) \
531 	if (flags & GRND_##n) { \
532 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
533 		flags &= ~GRND_##n; \
534 	}
535 
536 	P_FLAG(RANDOM);
537 	P_FLAG(NONBLOCK);
538 #undef P_FLAG
539 
540 	if (flags)
541 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
542 
543 	return printed;
544 }
545 
546 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
547 
/*
 * Initializer for a syscall_arg_fmt that prints the argument through
 * strarray__<array>. The 'name' parameter is not used by the expansion; it
 * only documents which argument the entry is for.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
551 
552 #include "trace/beauty/arch_errno_names.c"
553 #include "trace/beauty/eventfd.c"
554 #include "trace/beauty/futex_op.c"
555 #include "trace/beauty/futex_val3.c"
556 #include "trace/beauty/mmap.c"
557 #include "trace/beauty/mode_t.c"
558 #include "trace/beauty/msg_flags.c"
559 #include "trace/beauty/open_flags.c"
560 #include "trace/beauty/perf_event_open.c"
561 #include "trace/beauty/pid.c"
562 #include "trace/beauty/sched_policy.c"
563 #include "trace/beauty/seccomp.c"
564 #include "trace/beauty/signum.c"
565 #include "trace/beauty/socket_type.c"
566 #include "trace/beauty/waitid_options.c"
567 
/* How one syscall argument is pretty-printed. */
struct syscall_arg_fmt {
	/* formatter writing the argument into bf, returns the scnprintf() count */
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* formatter-specific data, e.g. the string table(s) for the strarray formatters */
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
574 
575 static struct syscall_fmt {
576 	const char *name;
577 	const char *alias;
578 	struct syscall_arg_fmt arg[6];
579 	u8	   nr_args;
580 	bool	   errpid;
581 	bool	   timeout;
582 	bool	   hexret;
583 } syscall_fmts[] = {
584 	{ .name	    = "access",
585 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
586 	{ .name	    = "bpf",
587 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
588 	{ .name	    = "brk",	    .hexret = true,
589 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
590 	{ .name     = "clock_gettime",
591 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
592 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
593 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
594 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
595 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
596 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
597 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
598 	{ .name	    = "close",
599 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
600 	{ .name	    = "epoll_ctl",
601 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
602 	{ .name	    = "eventfd2",
603 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
604 	{ .name	    = "fchmodat",
605 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
606 	{ .name	    = "fchownat",
607 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
608 	{ .name	    = "fcntl",
609 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
610 			   .parm      = &strarrays__fcntl_cmds_arrays,
611 			   .show_zero = true, },
612 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
613 	{ .name	    = "flock",
614 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
615 	{ .name	    = "fstat", .alias = "newfstat", },
616 	{ .name	    = "fstatat", .alias = "newfstatat", },
617 	{ .name	    = "futex",
618 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
619 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
620 	{ .name	    = "futimesat",
621 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
622 	{ .name	    = "getitimer",
623 	  .arg = { [0] = STRARRAY(which, itimers), }, },
624 	{ .name	    = "getpid",	    .errpid = true, },
625 	{ .name	    = "getpgid",    .errpid = true, },
626 	{ .name	    = "getppid",    .errpid = true, },
627 	{ .name	    = "getrandom",
628 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
629 	{ .name	    = "getrlimit",
630 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
631 	{ .name	    = "gettid",	    .errpid = true, },
632 	{ .name	    = "ioctl",
633 	  .arg = {
634 #if defined(__i386__) || defined(__x86_64__)
635 /*
636  * FIXME: Make this available to all arches.
637  */
638 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
639 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
640 #else
641 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
642 #endif
643 	{ .name	    = "kcmp",	    .nr_args = 5,
644 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
645 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
646 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
647 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
648 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
649 	{ .name	    = "keyctl",
650 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
651 	{ .name	    = "kill",
652 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
653 	{ .name	    = "linkat",
654 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
655 	{ .name	    = "lseek",
656 	  .arg = { [2] = STRARRAY(whence, whences), }, },
657 	{ .name	    = "lstat", .alias = "newlstat", },
658 	{ .name     = "madvise",
659 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
660 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
661 	{ .name	    = "mkdirat",
662 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
663 	{ .name	    = "mknodat",
664 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
665 	{ .name	    = "mlock",
666 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
667 	{ .name	    = "mlockall",
668 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
669 	{ .name	    = "mmap",	    .hexret = true,
670 /* The standard mmap maps to old_mmap on s390x */
671 #if defined(__s390x__)
672 	.alias = "old_mmap",
673 #endif
674 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
675 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
676 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
677 	{ .name	    = "mprotect",
678 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
679 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
680 	{ .name	    = "mq_unlink",
681 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
682 	{ .name	    = "mremap",	    .hexret = true,
683 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
684 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
685 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
686 	{ .name	    = "munlock",
687 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
688 	{ .name	    = "munmap",
689 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
690 	{ .name	    = "name_to_handle_at",
691 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
692 	{ .name	    = "newfstatat",
693 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
694 	{ .name	    = "open",
695 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "open_by_handle_at",
697 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
698 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 	{ .name	    = "openat",
700 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
701 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
702 	{ .name	    = "perf_event_open",
703 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
704 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
705 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
706 	{ .name	    = "pipe2",
707 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
708 	{ .name	    = "pkey_alloc",
709 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
710 	{ .name	    = "pkey_free",
711 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
712 	{ .name	    = "pkey_mprotect",
713 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
714 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
715 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
716 	{ .name	    = "poll", .timeout = true, },
717 	{ .name	    = "ppoll", .timeout = true, },
718 	{ .name	    = "prctl", .alias = "arch_prctl",
719 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
720 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
721 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
722 	{ .name	    = "pread", .alias = "pread64", },
723 	{ .name	    = "preadv", .alias = "pread", },
724 	{ .name	    = "prlimit64",
725 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
726 	{ .name	    = "pwrite", .alias = "pwrite64", },
727 	{ .name	    = "readlinkat",
728 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
729 	{ .name	    = "recvfrom",
730 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
731 	{ .name	    = "recvmmsg",
732 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
733 	{ .name	    = "recvmsg",
734 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
735 	{ .name	    = "renameat",
736 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
737 	{ .name	    = "rt_sigaction",
738 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
739 	{ .name	    = "rt_sigprocmask",
740 	  .arg = { [0] = STRARRAY(how, sighow), }, },
741 	{ .name	    = "rt_sigqueueinfo",
742 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
743 	{ .name	    = "rt_tgsigqueueinfo",
744 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
745 	{ .name	    = "sched_setscheduler",
746 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
747 	{ .name	    = "seccomp",
748 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
749 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "select", .timeout = true, },
751 	{ .name	    = "sendmmsg",
752 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
753 	{ .name	    = "sendmsg",
754 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
755 	{ .name	    = "sendto",
756 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
757 	{ .name	    = "set_tid_address", .errpid = true, },
758 	{ .name	    = "setitimer",
759 	  .arg = { [0] = STRARRAY(which, itimers), }, },
760 	{ .name	    = "setrlimit",
761 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
762 	{ .name	    = "socket",
763 	  .arg = { [0] = STRARRAY(family, socket_families),
764 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765 	{ .name	    = "socketpair",
766 	  .arg = { [0] = STRARRAY(family, socket_families),
767 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
768 	{ .name	    = "stat", .alias = "newstat", },
769 	{ .name	    = "statx",
770 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
771 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
772 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
773 	{ .name	    = "swapoff",
774 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
775 	{ .name	    = "swapon",
776 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
777 	{ .name	    = "symlinkat",
778 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
779 	{ .name	    = "tgkill",
780 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
781 	{ .name	    = "tkill",
782 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
783 	{ .name	    = "uname", .alias = "newuname", },
784 	{ .name	    = "unlinkat",
785 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
786 	{ .name	    = "utimensat",
787 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
788 	{ .name	    = "wait4",	    .errpid = true,
789 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
790 	{ .name	    = "waitid",	    .errpid = true,
791 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
792 };
793 
794 static int syscall_fmt__cmp(const void *name, const void *fmtp)
795 {
796 	const struct syscall_fmt *fmt = fmtp;
797 	return strcmp(name, fmt->name);
798 }
799 
800 static struct syscall_fmt *syscall_fmt__find(const char *name)
801 {
802 	const int nmemb = ARRAY_SIZE(syscall_fmts);
803 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
804 }
805 
/* What is known about one syscall: its tracepoint format, arguments and formatters. */
struct syscall {
	struct event_format	*tp_format;
	int			nr_args;
	struct format_field	*args;
	const char		*name;
	bool			is_exit;
	struct syscall_fmt	*fmt;		/* syscall_fmts[] style overrides */
	struct syscall_arg_fmt	*arg_fmt;
};
815 
816 /*
817  * We need to have this 'calculated' boolean because in some cases we really
818  * don't know what is the duration of a syscall, for instance, when we start
819  * a session and some threads are waiting for a syscall to finish, say 'poll',
820  * in which case all we can do is to print "( ? ) for duration and for the
821  * start timestamp.
822  */
823 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
824 {
825 	double duration = (double)t / NSEC_PER_MSEC;
826 	size_t printed = fprintf(fp, "(");
827 
828 	if (!calculated)
829 		printed += fprintf(fp, "         ");
830 	else if (duration >= 1.0)
831 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
832 	else if (duration >= 0.01)
833 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
834 	else
835 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
836 	return printed + fprintf(fp, "): ");
837 }
838 
839 /**
840  * filename.ptr: The filename char pointer that will be vfs_getname'd
841  * filename.entry_str_pos: Where to insert the string translated from
842  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
843  * ret_scnprintf: syscall args may set this to a different syscall return
844  *                formatter, for instance, fcntl may return fds, file flags, etc.
845  */
846 struct thread_trace {
847 	u64		  entry_time;
848 	bool		  entry_pending;
849 	unsigned long	  nr_events;
850 	unsigned long	  pfmaj, pfmin;
851 	char		  *entry_str;
852 	double		  runtime_ms;
853 	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
854         struct {
855 		unsigned long ptr;
856 		short int     entry_str_pos;
857 		bool	      pending_open;
858 		unsigned int  namelen;
859 		char	      *name;
860 	} filename;
861 	struct {
862 		int	  max;
863 		char	  **table;
864 	} paths;
865 
866 	struct intlist *syscall_stats;
867 };
868 
869 static struct thread_trace *thread_trace__new(void)
870 {
871 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
872 
873 	if (ttrace)
874 		ttrace->paths.max = -1;
875 
876 	ttrace->syscall_stats = intlist__new(NULL);
877 
878 	return ttrace;
879 }
880 
881 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
882 {
883 	struct thread_trace *ttrace;
884 
885 	if (thread == NULL)
886 		goto fail;
887 
888 	if (thread__priv(thread) == NULL)
889 		thread__set_priv(thread, thread_trace__new());
890 
891 	if (thread__priv(thread) == NULL)
892 		goto fail;
893 
894 	ttrace = thread__priv(thread);
895 	++ttrace->nr_events;
896 
897 	return ttrace;
898 fail:
899 	color_fprintf(fp, PERF_COLOR_RED,
900 		      "WARNING: not enough memory, dropping samples!\n");
901 	return NULL;
902 }
903 
904 
905 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
906 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
907 {
908 	struct thread_trace *ttrace = thread__priv(arg->thread);
909 
910 	ttrace->ret_scnprintf = ret_scnprintf;
911 }
912 
/* Bit flags; presumably the values combined in trace->trace_pgfaults - confirm */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size in bytes; presumably of the thread_trace entry_str buffer - confirm */
static const size_t trace__entry_str_size = 2048;
917 
918 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
919 {
920 	struct thread_trace *ttrace = thread__priv(thread);
921 
922 	if (fd > ttrace->paths.max) {
923 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
924 
925 		if (npath == NULL)
926 			return -1;
927 
928 		if (ttrace->paths.max != -1) {
929 			memset(npath + ttrace->paths.max + 1, 0,
930 			       (fd - ttrace->paths.max) * sizeof(char *));
931 		} else {
932 			memset(npath, 0, (fd + 1) * sizeof(char *));
933 		}
934 
935 		ttrace->paths.table = npath;
936 		ttrace->paths.max   = fd;
937 	}
938 
939 	ttrace->paths.table[fd] = strdup(pathname);
940 
941 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
942 }
943 
944 static int thread__read_fd_path(struct thread *thread, int fd)
945 {
946 	char linkname[PATH_MAX], pathname[PATH_MAX];
947 	struct stat st;
948 	int ret;
949 
950 	if (thread->pid_ == thread->tid) {
951 		scnprintf(linkname, sizeof(linkname),
952 			  "/proc/%d/fd/%d", thread->pid_, fd);
953 	} else {
954 		scnprintf(linkname, sizeof(linkname),
955 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
956 	}
957 
958 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
959 		return -1;
960 
961 	ret = readlink(linkname, pathname, sizeof(pathname));
962 
963 	if (ret < 0 || ret > st.st_size)
964 		return -1;
965 
966 	pathname[ret] = '\0';
967 	return trace__set_fd_pathname(thread, fd, pathname);
968 }
969 
970 static const char *thread__fd_path(struct thread *thread, int fd,
971 				   struct trace *trace)
972 {
973 	struct thread_trace *ttrace = thread__priv(thread);
974 
975 	if (ttrace == NULL)
976 		return NULL;
977 
978 	if (fd < 0)
979 		return NULL;
980 
981 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
982 		if (!trace->live)
983 			return NULL;
984 		++trace->stats.proc_getname;
985 		if (thread__read_fd_path(thread, fd))
986 			return NULL;
987 	}
988 
989 	return ttrace->paths.table[fd];
990 }
991 
992 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
993 {
994 	int fd = arg->val;
995 	size_t printed = scnprintf(bf, size, "%d", fd);
996 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
997 
998 	if (path)
999 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1000 
1001 	return printed;
1002 }
1003 
1004 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1005 {
1006         size_t printed = scnprintf(bf, size, "%d", fd);
1007 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1008 
1009 	if (thread) {
1010 		const char *path = thread__fd_path(thread, fd, trace);
1011 
1012 		if (path)
1013 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1014 
1015 		thread__put(thread);
1016 	}
1017 
1018         return printed;
1019 }
1020 
1021 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1022 					      struct syscall_arg *arg)
1023 {
1024 	int fd = arg->val;
1025 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1026 	struct thread_trace *ttrace = thread__priv(arg->thread);
1027 
1028 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1029 		zfree(&ttrace->paths.table[fd]);
1030 
1031 	return printed;
1032 }
1033 
1034 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1035 				     unsigned long ptr)
1036 {
1037 	struct thread_trace *ttrace = thread__priv(thread);
1038 
1039 	ttrace->filename.ptr = ptr;
1040 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1041 }
1042 
1043 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1044 					      struct syscall_arg *arg)
1045 {
1046 	unsigned long ptr = arg->val;
1047 
1048 	if (!arg->trace->vfs_getname)
1049 		return scnprintf(bf, size, "%#x", ptr);
1050 
1051 	thread__set_filename_pos(arg->thread, bf, ptr);
1052 	return 0;
1053 }
1054 
1055 static bool trace__filter_duration(struct trace *trace, double t)
1056 {
1057 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1058 }
1059 
1060 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1061 {
1062 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1063 
1064 	return fprintf(fp, "%10.3f ", ts);
1065 }
1066 
1067 /*
1068  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1069  * using ttrace->entry_time for a thread that receives a sys_exit without
1070  * first having received a sys_enter ("poll" issued before tracing session
1071  * starts, lost sys_enter exit due to ring buffer overflow).
1072  */
1073 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1074 {
1075 	if (tstamp > 0)
1076 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1077 
1078 	return fprintf(fp, "         ? ");
1079 }
1080 
/* Set to true by sig_handler() on any signal it handles. */
static bool done = false;
/* Set by sig_handler(): true only if the last handled signal was SIGINT. */
static bool interrupted = false;

/*
 * Signal handler: flag that the tool should finish and record whether the
 * signal was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1089 
1090 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1091 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1092 {
1093 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1094 	printed += fprintf_duration(duration, duration_calculated, fp);
1095 
1096 	if (trace->multiple_threads) {
1097 		if (trace->show_comm)
1098 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1099 		printed += fprintf(fp, "%d ", thread->tid);
1100 	}
1101 
1102 	return printed;
1103 }
1104 
1105 static int trace__process_event(struct trace *trace, struct machine *machine,
1106 				union perf_event *event, struct perf_sample *sample)
1107 {
1108 	int ret = 0;
1109 
1110 	switch (event->header.type) {
1111 	case PERF_RECORD_LOST:
1112 		color_fprintf(trace->output, PERF_COLOR_RED,
1113 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1114 		ret = machine__process_lost_event(machine, event, sample);
1115 		break;
1116 	default:
1117 		ret = machine__process_event(machine, event, sample);
1118 		break;
1119 	}
1120 
1121 	return ret;
1122 }
1123 
1124 static int trace__tool_process(struct perf_tool *tool,
1125 			       union perf_event *event,
1126 			       struct perf_sample *sample,
1127 			       struct machine *machine)
1128 {
1129 	struct trace *trace = container_of(tool, struct trace, tool);
1130 	return trace__process_event(trace, machine, event, sample);
1131 }
1132 
1133 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1134 {
1135 	struct machine *machine = vmachine;
1136 
1137 	if (machine->kptr_restrict_warned)
1138 		return NULL;
1139 
1140 	if (symbol_conf.kptr_restrict) {
1141 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1142 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1143 			   "Kernel samples will not be resolved.\n");
1144 		machine->kptr_restrict_warned = true;
1145 		return NULL;
1146 	}
1147 
1148 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1149 }
1150 
1151 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1152 {
1153 	int err = symbol__init(NULL);
1154 
1155 	if (err)
1156 		return err;
1157 
1158 	trace->host = machine__new_host();
1159 	if (trace->host == NULL)
1160 		return -ENOMEM;
1161 
1162 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1163 	if (err < 0)
1164 		goto out;
1165 
1166 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1167 					    evlist->threads, trace__tool_process, false,
1168 					    trace->opts.proc_map_timeout, 1);
1169 out:
1170 	if (err)
1171 		symbol__exit();
1172 
1173 	return err;
1174 }
1175 
/*
 * Counterpart of trace__symbols_init(): tear down the host machine, clear
 * the pointer to it and shut the symbol library down.
 */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	/* Don't leave a pointer to a machine that was already torn down. */
	trace->host = NULL;

	symbol__exit();
}
1183 
1184 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1185 {
1186 	int idx;
1187 
1188 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1189 		nr_args = sc->fmt->nr_args;
1190 
1191 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1192 	if (sc->arg_fmt == NULL)
1193 		return -1;
1194 
1195 	for (idx = 0; idx < nr_args; ++idx) {
1196 		if (sc->fmt)
1197 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1198 	}
1199 
1200 	sc->nr_args = nr_args;
1201 	return 0;
1202 }
1203 
1204 static int syscall__set_arg_fmts(struct syscall *sc)
1205 {
1206 	struct format_field *field;
1207 	int idx = 0, len;
1208 
1209 	for (field = sc->args; field; field = field->next, ++idx) {
1210 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1211 			continue;
1212 
1213 		if (strcmp(field->type, "const char *") == 0 &&
1214 			 (strcmp(field->name, "filename") == 0 ||
1215 			  strcmp(field->name, "path") == 0 ||
1216 			  strcmp(field->name, "pathname") == 0))
1217 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1218 		else if (field->flags & FIELD_IS_POINTER)
1219 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1220 		else if (strcmp(field->type, "pid_t") == 0)
1221 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1222 		else if (strcmp(field->type, "umode_t") == 0)
1223 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1224 		else if ((strcmp(field->type, "int") == 0 ||
1225 			  strcmp(field->type, "unsigned int") == 0 ||
1226 			  strcmp(field->type, "long") == 0) &&
1227 			 (len = strlen(field->name)) >= 2 &&
1228 			 strcmp(field->name + len - 2, "fd") == 0) {
1229 			/*
1230 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1231 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1232 			 * 65 int
1233 			 * 23 unsigned int
1234 			 * 7 unsigned long
1235 			 */
1236 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1237 		}
1238 	}
1239 
1240 	return 0;
1241 }
1242 
/*
 * Fill in the trace->syscalls.table entry for syscall @id: name, static
 * formatters, the sys_enter tracepoint format and per-argument formatters.
 * The table is grown (new slots zeroed) when @id is beyond its current end.
 *
 * Returns 0 on success, -1 when the id has no name, on allocation failure
 * or when the tracepoint format could not be obtained. Note that in the
 * latter case the entry keeps its name and arg_fmt.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero just the new slots; max == -1 means the table was empty. */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry with the alias from the static table, if there is one. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a tracepoint format, assume 6 raw arguments. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * The first field, '__syscall_nr' or 'nr', is the syscall number,
	 * which is not needed here, so drop it when present. It does not
	 * exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1303 
1304 static int trace__validate_ev_qualifier(struct trace *trace)
1305 {
1306 	int err = 0, i;
1307 	size_t nr_allocated;
1308 	struct str_node *pos;
1309 
1310 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1311 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1312 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1313 
1314 	if (trace->ev_qualifier_ids.entries == NULL) {
1315 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1316 		       trace->output);
1317 		err = -EINVAL;
1318 		goto out;
1319 	}
1320 
1321 	nr_allocated = trace->ev_qualifier_ids.nr;
1322 	i = 0;
1323 
1324 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1325 		const char *sc = pos->s;
1326 		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1327 
1328 		if (id < 0) {
1329 			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1330 			if (id >= 0)
1331 				goto matches;
1332 
1333 			if (err == 0) {
1334 				fputs("Error:\tInvalid syscall ", trace->output);
1335 				err = -EINVAL;
1336 			} else {
1337 				fputs(", ", trace->output);
1338 			}
1339 
1340 			fputs(sc, trace->output);
1341 		}
1342 matches:
1343 		trace->ev_qualifier_ids.entries[i++] = id;
1344 		if (match_next == -1)
1345 			continue;
1346 
1347 		while (1) {
1348 			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1349 			if (id < 0)
1350 				break;
1351 			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1352 				void *entries;
1353 
1354 				nr_allocated += 8;
1355 				entries = realloc(trace->ev_qualifier_ids.entries,
1356 						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1357 				if (entries == NULL) {
1358 					err = -ENOMEM;
1359 					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
1360 					goto out_free;
1361 				}
1362 				trace->ev_qualifier_ids.entries = entries;
1363 			}
1364 			trace->ev_qualifier_ids.nr++;
1365 			trace->ev_qualifier_ids.entries[i++] = id;
1366 		}
1367 	}
1368 
1369 	if (err < 0) {
1370 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1371 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1372 out_free:
1373 		zfree(&trace->ev_qualifier_ids.entries);
1374 		trace->ev_qualifier_ids.nr = 0;
1375 	}
1376 out:
1377 	return err;
1378 }
1379 
1380 /*
1381  * args is to be interpreted as a series of longs but we need to handle
1382  * 8-byte unaligned accesses. args points to raw_data within the event
1383  * and raw_data is guaranteed to be 8-byte unaligned because it is
1384  * preceded by raw_size which is a u32. So we need to copy args to a temp
1385  * variable to read it. Most notably this avoids extended load instructions
1386  * on unaligned addresses
1387  */
1388 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1389 {
1390 	unsigned long val;
1391 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1392 
1393 	memcpy(&val, p, sizeof(val));
1394 	return val;
1395 }
1396 
1397 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1398 				      struct syscall_arg *arg)
1399 {
1400 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1401 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1402 
1403 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1404 }
1405 
1406 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1407 				     struct syscall_arg *arg, unsigned long val)
1408 {
1409 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1410 		arg->val = val;
1411 		if (sc->arg_fmt[arg->idx].parm)
1412 			arg->parm = sc->arg_fmt[arg->idx].parm;
1413 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1414 	}
1415 	return scnprintf(bf, size, "%ld", val);
1416 }
1417 
/*
 * Format the arguments of syscall @sc, found in the raw buffer @args, into
 * @bf as a ", " separated list of "name: value" pairs.
 *
 * When the tracepoint fields are known (sc->args) they drive the output and
 * zero valued arguments may be suppressed; when the tracepoint format
 * could not be read, the sc->nr_args raw values are printed instead.
 *
 * Returns the number of characters written to @bf.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	/* Mask bit corresponding to the argument at arg.idx. */
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Skip arguments whose bit is set in arg.mask. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1490 
/*
 * Signature of a per-tracepoint sample handler; the trace__sys_enter() and
 * trace__sys_exit() style functions in this file match it.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1494 
1495 static struct syscall *trace__syscall_info(struct trace *trace,
1496 					   struct perf_evsel *evsel, int id)
1497 {
1498 
1499 	if (id < 0) {
1500 
1501 		/*
1502 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1503 		 * before that, leaving at a higher verbosity level till that is
1504 		 * explained. Reproduced with plain ftrace with:
1505 		 *
1506 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1507 		 * grep "NR -1 " /t/trace_pipe
1508 		 *
1509 		 * After generating some load on the machine.
1510  		 */
1511 		if (verbose > 1) {
1512 			static u64 n;
1513 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1514 				id, perf_evsel__name(evsel), ++n);
1515 		}
1516 		return NULL;
1517 	}
1518 
1519 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1520 	    trace__read_syscall_info(trace, id))
1521 		goto out_cant_read;
1522 
1523 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1524 		goto out_cant_read;
1525 
1526 	return &trace->syscalls.table[id];
1527 
1528 out_cant_read:
1529 	if (verbose > 0) {
1530 		fprintf(trace->output, "Problems reading syscall %d", id);
1531 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1532 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1533 		fputs(" information\n", trace->output);
1534 	}
1535 	return NULL;
1536 }
1537 
1538 static void thread__update_stats(struct thread_trace *ttrace,
1539 				 int id, struct perf_sample *sample)
1540 {
1541 	struct int_node *inode;
1542 	struct stats *stats;
1543 	u64 duration = 0;
1544 
1545 	inode = intlist__findnew(ttrace->syscall_stats, id);
1546 	if (inode == NULL)
1547 		return;
1548 
1549 	stats = inode->priv;
1550 	if (stats == NULL) {
1551 		stats = malloc(sizeof(struct stats));
1552 		if (stats == NULL)
1553 			return;
1554 		init_stats(stats);
1555 		inode->priv = stats;
1556 	}
1557 
1558 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1559 		duration = sample->time - ttrace->entry_time;
1560 
1561 	update_stats(stats, duration);
1562 }
1563 
1564 static int trace__printf_interrupted_entry(struct trace *trace)
1565 {
1566 	struct thread_trace *ttrace;
1567 	size_t printed;
1568 
1569 	if (trace->failure_only || trace->current == NULL)
1570 		return 0;
1571 
1572 	ttrace = thread__priv(trace->current);
1573 
1574 	if (!ttrace->entry_pending)
1575 		return 0;
1576 
1577 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1578 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1579 	ttrace->entry_pending = false;
1580 
1581 	return printed;
1582 }
1583 
1584 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1585 				 struct perf_sample *sample, struct thread *thread)
1586 {
1587 	int printed = 0;
1588 
1589 	if (trace->print_sample) {
1590 		double ts = (double)sample->time / NSEC_PER_MSEC;
1591 
1592 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1593 				   perf_evsel__name(evsel), ts,
1594 				   thread__comm_str(thread),
1595 				   sample->pid, sample->tid, sample->cpu);
1596 	}
1597 
1598 	return printed;
1599 }
1600 
/*
 * Handler for syscall entry samples.
 *
 * Formats "name(args" into the thread's entry_str buffer (allocated on first
 * use) and leaves it pending, to be completed when the matching exit is
 * seen. Syscalls flagged as is_exit are printed right away.
 * Also makes this thread the trace's current one.
 *
 * Returns 0 on success, -1 when the syscall is unknown or on allocation
 * failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush a still pending entry of the current thread before starting a new one. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* Printed at entry time instead of being left pending. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Swap the reference held in trace->current over to this thread. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1661 
1662 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1663 				    struct perf_sample *sample,
1664 				    struct callchain_cursor *cursor)
1665 {
1666 	struct addr_location al;
1667 	int max_stack = evsel->attr.sample_max_stack ?
1668 			evsel->attr.sample_max_stack :
1669 			trace->max_stack;
1670 
1671 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1672 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1673 		return -1;
1674 
1675 	return 0;
1676 }
1677 
1678 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1679 {
1680 	/* TODO: user-configurable print_opts */
1681 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1682 				        EVSEL__PRINT_DSO |
1683 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1684 
1685 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1686 }
1687 
/*
 * Map errno value @err to its name for the architecture of the environment
 * @evsel belongs to.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	const char *arch = perf_env__arch(perf_evsel__env(evsel));

	return arch_syscalls__strerrno(arch, err);
}
1695 
1696 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1697 			   union perf_event *event __maybe_unused,
1698 			   struct perf_sample *sample)
1699 {
1700 	long ret;
1701 	u64 duration = 0;
1702 	bool duration_calculated = false;
1703 	struct thread *thread;
1704 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1705 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1706 	struct thread_trace *ttrace;
1707 
1708 	if (sc == NULL)
1709 		return -1;
1710 
1711 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1712 	ttrace = thread__trace(thread, trace->output);
1713 	if (ttrace == NULL)
1714 		goto out_put;
1715 
1716 	trace__fprintf_sample(trace, evsel, sample, thread);
1717 
1718 	if (trace->summary)
1719 		thread__update_stats(ttrace, id, sample);
1720 
1721 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1722 
1723 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1724 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1725 		ttrace->filename.pending_open = false;
1726 		++trace->stats.vfs_getname;
1727 	}
1728 
1729 	if (ttrace->entry_time) {
1730 		duration = sample->time - ttrace->entry_time;
1731 		if (trace__filter_duration(trace, duration))
1732 			goto out;
1733 		duration_calculated = true;
1734 	} else if (trace->duration_filter)
1735 		goto out;
1736 
1737 	if (sample->callchain) {
1738 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1739 		if (callchain_ret == 0) {
1740 			if (callchain_cursor.nr < trace->min_stack)
1741 				goto out;
1742 			callchain_ret = 1;
1743 		}
1744 	}
1745 
1746 	if (trace->summary_only || (ret >= 0 && trace->failure_only))
1747 		goto out;
1748 
1749 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1750 
1751 	if (ttrace->entry_pending) {
1752 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1753 	} else {
1754 		fprintf(trace->output, " ... [");
1755 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1756 		fprintf(trace->output, "]: %s()", sc->name);
1757 	}
1758 
1759 	if (sc->fmt == NULL) {
1760 		if (ret < 0)
1761 			goto errno_print;
1762 signed_print:
1763 		fprintf(trace->output, ") = %ld", ret);
1764 	} else if (ret < 0) {
1765 errno_print: {
1766 		char bf[STRERR_BUFSIZE];
1767 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1768 			   *e = errno_to_name(evsel, -ret);
1769 
1770 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1771 	}
1772 	} else if (ret == 0 && sc->fmt->timeout)
1773 		fprintf(trace->output, ") = 0 Timeout");
1774 	else if (ttrace->ret_scnprintf) {
1775 		char bf[1024];
1776 		struct syscall_arg arg = {
1777 			.val	= ret,
1778 			.thread	= thread,
1779 			.trace	= trace,
1780 		};
1781 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1782 		ttrace->ret_scnprintf = NULL;
1783 		fprintf(trace->output, ") = %s", bf);
1784 	} else if (sc->fmt->hexret)
1785 		fprintf(trace->output, ") = %#lx", ret);
1786 	else if (sc->fmt->errpid) {
1787 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1788 
1789 		if (child != NULL) {
1790 			fprintf(trace->output, ") = %ld", ret);
1791 			if (child->comm_set)
1792 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1793 			thread__put(child);
1794 		}
1795 	} else
1796 		goto signed_print;
1797 
1798 	fputc('\n', trace->output);
1799 
1800 	if (callchain_ret > 0)
1801 		trace__fprintf_callchain(trace, sample);
1802 	else if (callchain_ret < 0)
1803 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1804 out:
1805 	ttrace->entry_pending = false;
1806 	err = 0;
1807 out_put:
1808 	thread__put(thread);
1809 	return err;
1810 }
1811 
/*
 * Handler for the vfs_getname event, which carries in its "pathname" field
 * the filename a syscall referred to.
 *
 * The name is copied to ttrace->filename.name (grown as needed) and flagged
 * as a pending open, see trace__sys_exit(). If a filename argument position
 * was recorded while formatting the syscall entry (filename.ptr != 0), the
 * name is also spliced into entry_str at that position.
 *
 * Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* namelen tracks the longest name the buffer can hold, sans the NUL. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No filename position recorded in entry_str: nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Not enough room: keep only the tail of the filename. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1872 
1873 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1874 				     union perf_event *event __maybe_unused,
1875 				     struct perf_sample *sample)
1876 {
1877         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1878 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1879 	struct thread *thread = machine__findnew_thread(trace->host,
1880 							sample->pid,
1881 							sample->tid);
1882 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1883 
1884 	if (ttrace == NULL)
1885 		goto out_dump;
1886 
1887 	ttrace->runtime_ms += runtime_ms;
1888 	trace->runtime_ms += runtime_ms;
1889 out_put:
1890 	thread__put(thread);
1891 	return 0;
1892 
1893 out_dump:
1894 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1895 	       evsel->name,
1896 	       perf_evsel__strval(evsel, sample, "comm"),
1897 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1898 	       runtime,
1899 	       perf_evsel__intval(evsel, sample, "vruntime"));
1900 	goto out_put;
1901 }
1902 
1903 static int bpf_output__printer(enum binary_printer_ops op,
1904 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1905 {
1906 	unsigned char ch = (unsigned char)val;
1907 
1908 	switch (op) {
1909 	case BINARY_PRINT_CHAR_DATA:
1910 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1911 	case BINARY_PRINT_DATA_BEGIN:
1912 	case BINARY_PRINT_LINE_BEGIN:
1913 	case BINARY_PRINT_ADDR:
1914 	case BINARY_PRINT_NUM_DATA:
1915 	case BINARY_PRINT_NUM_PAD:
1916 	case BINARY_PRINT_SEP:
1917 	case BINARY_PRINT_CHAR_PAD:
1918 	case BINARY_PRINT_LINE_END:
1919 	case BINARY_PRINT_DATA_END:
1920 	default:
1921 		break;
1922 	}
1923 
1924 	return 0;
1925 }
1926 
1927 static void bpf_output__fprintf(struct trace *trace,
1928 				struct perf_sample *sample)
1929 {
1930 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1931 			bpf_output__printer, NULL, trace->output);
1932 }
1933 
1934 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1935 				union perf_event *event __maybe_unused,
1936 				struct perf_sample *sample)
1937 {
1938 	int callchain_ret = 0;
1939 
1940 	if (sample->callchain) {
1941 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1942 		if (callchain_ret == 0) {
1943 			if (callchain_cursor.nr < trace->min_stack)
1944 				goto out;
1945 			callchain_ret = 1;
1946 		}
1947 	}
1948 
1949 	trace__printf_interrupted_entry(trace);
1950 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1951 
1952 	if (trace->trace_syscalls)
1953 		fprintf(trace->output, "(         ): ");
1954 
1955 	fprintf(trace->output, "%s:", evsel->name);
1956 
1957 	if (perf_evsel__is_bpf_output(evsel)) {
1958 		bpf_output__fprintf(trace, sample);
1959 	} else if (evsel->tp_format) {
1960 		event_format__fprintf(evsel->tp_format, sample->cpu,
1961 				      sample->raw_data, sample->raw_size,
1962 				      trace->output);
1963 	}
1964 
1965 	fprintf(trace->output, ")\n");
1966 
1967 	if (callchain_ret > 0)
1968 		trace__fprintf_callchain(trace, sample);
1969 	else if (callchain_ret < 0)
1970 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1971 out:
1972 	return 0;
1973 }
1974 
1975 static void print_location(FILE *f, struct perf_sample *sample,
1976 			   struct addr_location *al,
1977 			   bool print_dso, bool print_sym)
1978 {
1979 
1980 	if ((verbose > 0 || print_dso) && al->map)
1981 		fprintf(f, "%s@", al->map->dso->long_name);
1982 
1983 	if ((verbose > 0 || print_sym) && al->sym)
1984 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1985 			al->addr - al->sym->start);
1986 	else if (al->map)
1987 		fprintf(f, "0x%" PRIx64, al->addr);
1988 	else
1989 		fprintf(f, "0x%" PRIx64, sample->addr);
1990 }
1991 
1992 static int trace__pgfault(struct trace *trace,
1993 			  struct perf_evsel *evsel,
1994 			  union perf_event *event __maybe_unused,
1995 			  struct perf_sample *sample)
1996 {
1997 	struct thread *thread;
1998 	struct addr_location al;
1999 	char map_type = 'd';
2000 	struct thread_trace *ttrace;
2001 	int err = -1;
2002 	int callchain_ret = 0;
2003 
2004 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2005 
2006 	if (sample->callchain) {
2007 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2008 		if (callchain_ret == 0) {
2009 			if (callchain_cursor.nr < trace->min_stack)
2010 				goto out_put;
2011 			callchain_ret = 1;
2012 		}
2013 	}
2014 
2015 	ttrace = thread__trace(thread, trace->output);
2016 	if (ttrace == NULL)
2017 		goto out_put;
2018 
2019 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2020 		ttrace->pfmaj++;
2021 	else
2022 		ttrace->pfmin++;
2023 
2024 	if (trace->summary_only)
2025 		goto out;
2026 
2027 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2028 			      sample->ip, &al);
2029 
2030 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2031 
2032 	fprintf(trace->output, "%sfault [",
2033 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2034 		"maj" : "min");
2035 
2036 	print_location(trace->output, sample, &al, false, true);
2037 
2038 	fprintf(trace->output, "] => ");
2039 
2040 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2041 				   sample->addr, &al);
2042 
2043 	if (!al.map) {
2044 		thread__find_addr_location(thread, sample->cpumode,
2045 					   MAP__FUNCTION, sample->addr, &al);
2046 
2047 		if (al.map)
2048 			map_type = 'x';
2049 		else
2050 			map_type = '?';
2051 	}
2052 
2053 	print_location(trace->output, sample, &al, true, false);
2054 
2055 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2056 
2057 	if (callchain_ret > 0)
2058 		trace__fprintf_callchain(trace, sample);
2059 	else if (callchain_ret < 0)
2060 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2061 out:
2062 	err = 0;
2063 out_put:
2064 	thread__put(thread);
2065 	return err;
2066 }
2067 
2068 static void trace__set_base_time(struct trace *trace,
2069 				 struct perf_evsel *evsel,
2070 				 struct perf_sample *sample)
2071 {
2072 	/*
2073 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2074 	 * and don't use sample->time unconditionally, we may end up having
2075 	 * some other event in the future without PERF_SAMPLE_TIME for good
2076 	 * reason, i.e. we may not be interested in its timestamps, just in
2077 	 * it taking place, picking some piece of information when it
2078 	 * appears in our event stream (vfs_getname comes to mind).
2079 	 */
2080 	if (trace->base_time == 0 && !trace->full_time &&
2081 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2082 		trace->base_time = sample->time;
2083 }
2084 
2085 static int trace__process_sample(struct perf_tool *tool,
2086 				 union perf_event *event,
2087 				 struct perf_sample *sample,
2088 				 struct perf_evsel *evsel,
2089 				 struct machine *machine __maybe_unused)
2090 {
2091 	struct trace *trace = container_of(tool, struct trace, tool);
2092 	struct thread *thread;
2093 	int err = 0;
2094 
2095 	tracepoint_handler handler = evsel->handler;
2096 
2097 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098 	if (thread && thread__is_filtered(thread))
2099 		goto out;
2100 
2101 	trace__set_base_time(trace, evsel, sample);
2102 
2103 	if (handler) {
2104 		++trace->nr_events;
2105 		handler(trace, evsel, event, sample);
2106 	}
2107 out:
2108 	thread__put(thread);
2109 	return err;
2110 }
2111 
2112 static int trace__record(struct trace *trace, int argc, const char **argv)
2113 {
2114 	unsigned int rec_argc, i, j;
2115 	const char **rec_argv;
2116 	const char * const record_args[] = {
2117 		"record",
2118 		"-R",
2119 		"-m", "1024",
2120 		"-c", "1",
2121 	};
2122 
2123 	const char * const sc_args[] = { "-e", };
2124 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2125 	const char * const majpf_args[] = { "-e", "major-faults" };
2126 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2127 	const char * const minpf_args[] = { "-e", "minor-faults" };
2128 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2129 
2130 	/* +1 is for the event string below */
2131 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2132 		majpf_args_nr + minpf_args_nr + argc;
2133 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2134 
2135 	if (rec_argv == NULL)
2136 		return -ENOMEM;
2137 
2138 	j = 0;
2139 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2140 		rec_argv[j++] = record_args[i];
2141 
2142 	if (trace->trace_syscalls) {
2143 		for (i = 0; i < sc_args_nr; i++)
2144 			rec_argv[j++] = sc_args[i];
2145 
2146 		/* event string may be different for older kernels - e.g., RHEL6 */
2147 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2148 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2149 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2150 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2151 		else {
2152 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2153 			free(rec_argv);
2154 			return -1;
2155 		}
2156 	}
2157 
2158 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2159 		for (i = 0; i < majpf_args_nr; i++)
2160 			rec_argv[j++] = majpf_args[i];
2161 
2162 	if (trace->trace_pgfaults & TRACE_PFMIN)
2163 		for (i = 0; i < minpf_args_nr; i++)
2164 			rec_argv[j++] = minpf_args[i];
2165 
2166 	for (i = 0; i < (unsigned int)argc; i++)
2167 		rec_argv[j++] = argv[i];
2168 
2169 	return cmd_record(j, rec_argv);
2170 }
2171 
2172 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2173 
2174 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2175 {
2176 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2177 
2178 	if (IS_ERR(evsel))
2179 		return false;
2180 
2181 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2182 		perf_evsel__delete(evsel);
2183 		return false;
2184 	}
2185 
2186 	evsel->handler = trace__vfs_getname;
2187 	perf_evlist__add(evlist, evsel);
2188 	return true;
2189 }
2190 
2191 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2192 {
2193 	struct perf_evsel *evsel;
2194 	struct perf_event_attr attr = {
2195 		.type = PERF_TYPE_SOFTWARE,
2196 		.mmap_data = 1,
2197 	};
2198 
2199 	attr.config = config;
2200 	attr.sample_period = 1;
2201 
2202 	event_attr_init(&attr);
2203 
2204 	evsel = perf_evsel__new(&attr);
2205 	if (evsel)
2206 		evsel->handler = trace__pgfault;
2207 
2208 	return evsel;
2209 }
2210 
2211 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2212 {
2213 	const u32 type = event->header.type;
2214 	struct perf_evsel *evsel;
2215 
2216 	if (type != PERF_RECORD_SAMPLE) {
2217 		trace__process_event(trace, trace->host, event, sample);
2218 		return;
2219 	}
2220 
2221 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2222 	if (evsel == NULL) {
2223 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2224 		return;
2225 	}
2226 
2227 	trace__set_base_time(trace, evsel, sample);
2228 
2229 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2230 	    sample->raw_data == NULL) {
2231 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2232 		       perf_evsel__name(evsel), sample->tid,
2233 		       sample->cpu, sample->raw_size);
2234 	} else {
2235 		tracepoint_handler handler = evsel->handler;
2236 		handler(trace, evsel, event, sample);
2237 	}
2238 }
2239 
2240 static int trace__add_syscall_newtp(struct trace *trace)
2241 {
2242 	int ret = -1;
2243 	struct perf_evlist *evlist = trace->evlist;
2244 	struct perf_evsel *sys_enter, *sys_exit;
2245 
2246 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2247 	if (sys_enter == NULL)
2248 		goto out;
2249 
2250 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2251 		goto out_delete_sys_enter;
2252 
2253 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2254 	if (sys_exit == NULL)
2255 		goto out_delete_sys_enter;
2256 
2257 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2258 		goto out_delete_sys_exit;
2259 
2260 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2261 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2262 
2263 	perf_evlist__add(evlist, sys_enter);
2264 	perf_evlist__add(evlist, sys_exit);
2265 
2266 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2267 		/*
2268 		 * We're interested only in the user space callchain
2269 		 * leading to the syscall, allow overriding that for
2270 		 * debugging reasons using --kernel_syscall_callchains
2271 		 */
2272 		sys_exit->attr.exclude_callchain_kernel = 1;
2273 	}
2274 
2275 	trace->syscalls.events.sys_enter = sys_enter;
2276 	trace->syscalls.events.sys_exit  = sys_exit;
2277 
2278 	ret = 0;
2279 out:
2280 	return ret;
2281 
2282 out_delete_sys_exit:
2283 	perf_evsel__delete_priv(sys_exit);
2284 out_delete_sys_enter:
2285 	perf_evsel__delete_priv(sys_enter);
2286 	goto out;
2287 }
2288 
2289 static int trace__set_ev_qualifier_filter(struct trace *trace)
2290 {
2291 	int err = -1;
2292 	struct perf_evsel *sys_exit;
2293 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2294 						trace->ev_qualifier_ids.nr,
2295 						trace->ev_qualifier_ids.entries);
2296 
2297 	if (filter == NULL)
2298 		goto out_enomem;
2299 
2300 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2301 					  filter)) {
2302 		sys_exit = trace->syscalls.events.sys_exit;
2303 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2304 	}
2305 
2306 	free(filter);
2307 out:
2308 	return err;
2309 out_enomem:
2310 	errno = ENOMEM;
2311 	goto out;
2312 }
2313 
2314 static int trace__set_filter_loop_pids(struct trace *trace)
2315 {
2316 	unsigned int nr = 1;
2317 	pid_t pids[32] = {
2318 		getpid(),
2319 	};
2320 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2321 
2322 	while (thread && nr < ARRAY_SIZE(pids)) {
2323 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2324 
2325 		if (parent == NULL)
2326 			break;
2327 
2328 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2329 			pids[nr++] = parent->tid;
2330 			break;
2331 		}
2332 		thread = parent;
2333 	}
2334 
2335 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2336 }
2337 
/*
 * Live tracing mode.
 *
 * Adds the requested events to trace->evlist (syscall tracepoints plus
 * vfs_getname, major/minor page faults, sched_stat_runtime), applies the
 * default cgroup, creates the target maps, optionally prepares the
 * workload given in @argv (when @argc > 0), opens/filters/mmaps the
 * events and then loops reading the ring buffers, handing each event to
 * trace__handle_event().
 *
 * The loop ends when 'interrupted' is set, or when no new events were
 * read and polling is no longer attempted or returns nothing.  On the
 * way out the events are disabled, the summary and tool stats are
 * printed if err is 0, and the evlist is deleted (trace->evlist is set
 * to NULL).
 *
 * Returns the value left in err: 0 on success, non-zero after a failure.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	/* A command line was given, so a workload has to be forked. */
	const bool forks = argc > 0;
	/* Set once events were disabled and we are only emptying the buffers. */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	/* vfs_getname is optional: just record whether it could be added. */
	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	/* NOTE(review): any pid filter failure is reported as out of memory. */
	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure evsel is used by the error path to name the offending event. */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* initial_delay is multiplied by 1000 for usleep(), i.e. it is in msecs. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	/* Remember the event count to detect a pass that read nothing. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/*
			 * Asked to finish: disable the events just once and
			 * keep consuming what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	if (trace->nr_events == before) {
		/* Nothing was read: wait forever, or 100ms once done is set. */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			/*
			 * NOTE(review): presumably a zero return means no
			 * pollable fds are left after dropping the ones in
			 * error/hangup state - confirm.
			 */
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	/* err still holds the result of the last sample parse (or setup step). */
	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error reporting, reached only via goto.  The braces below just scope
 * errbuf for the labels that need a message buffer.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2614 
2615 static int trace__replay(struct trace *trace)
2616 {
2617 	const struct perf_evsel_str_handler handlers[] = {
2618 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2619 	};
2620 	struct perf_data data = {
2621 		.file      = {
2622 			.path = input_name,
2623 		},
2624 		.mode      = PERF_DATA_MODE_READ,
2625 		.force     = trace->force,
2626 	};
2627 	struct perf_session *session;
2628 	struct perf_evsel *evsel;
2629 	int err = -1;
2630 
2631 	trace->tool.sample	  = trace__process_sample;
2632 	trace->tool.mmap	  = perf_event__process_mmap;
2633 	trace->tool.mmap2	  = perf_event__process_mmap2;
2634 	trace->tool.comm	  = perf_event__process_comm;
2635 	trace->tool.exit	  = perf_event__process_exit;
2636 	trace->tool.fork	  = perf_event__process_fork;
2637 	trace->tool.attr	  = perf_event__process_attr;
2638 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2639 	trace->tool.build_id	  = perf_event__process_build_id;
2640 	trace->tool.namespaces	  = perf_event__process_namespaces;
2641 
2642 	trace->tool.ordered_events = true;
2643 	trace->tool.ordering_requires_timestamps = true;
2644 
2645 	/* add tid to output */
2646 	trace->multiple_threads = true;
2647 
2648 	session = perf_session__new(&data, false, &trace->tool);
2649 	if (session == NULL)
2650 		return -1;
2651 
2652 	if (trace->opts.target.pid)
2653 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2654 
2655 	if (trace->opts.target.tid)
2656 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2657 
2658 	if (symbol__init(&session->header.env) < 0)
2659 		goto out;
2660 
2661 	trace->host = &session->machines.host;
2662 
2663 	err = perf_session__set_tracepoints_handlers(session, handlers);
2664 	if (err)
2665 		goto out;
2666 
2667 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2668 						     "raw_syscalls:sys_enter");
2669 	/* older kernels have syscalls tp versus raw_syscalls */
2670 	if (evsel == NULL)
2671 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2672 							     "syscalls:sys_enter");
2673 
2674 	if (evsel &&
2675 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2676 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2677 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2678 		goto out;
2679 	}
2680 
2681 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2682 						     "raw_syscalls:sys_exit");
2683 	if (evsel == NULL)
2684 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2685 							     "syscalls:sys_exit");
2686 	if (evsel &&
2687 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2688 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2689 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2690 		goto out;
2691 	}
2692 
2693 	evlist__for_each_entry(session->evlist, evsel) {
2694 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2695 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2696 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2697 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2698 			evsel->handler = trace__pgfault;
2699 	}
2700 
2701 	setup_pager();
2702 
2703 	err = perf_session__process_events(session);
2704 	if (err)
2705 		pr_err("Failed to process events, error %d", err);
2706 
2707 	else if (trace->summary)
2708 		trace__fprintf_thread_summary(trace, trace->output);
2709 
2710 out:
2711 	perf_session__delete(session);
2712 
2713 	return err;
2714 }
2715 
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns fprintf()'s result converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2724 
2725 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2726 	struct stats 	*stats;
2727 	double		msecs;
2728 	int		syscall;
2729 )
2730 {
2731 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2732 	struct stats *stats = source->priv;
2733 
2734 	entry->syscall = source->i;
2735 	entry->stats   = stats;
2736 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2737 }
2738 
2739 static size_t thread__dump_stats(struct thread_trace *ttrace,
2740 				 struct trace *trace, FILE *fp)
2741 {
2742 	size_t printed = 0;
2743 	struct syscall *sc;
2744 	struct rb_node *nd;
2745 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2746 
2747 	if (syscall_stats == NULL)
2748 		return 0;
2749 
2750 	printed += fprintf(fp, "\n");
2751 
2752 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2753 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2754 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2755 
2756 	resort_rb__for_each_entry(nd, syscall_stats) {
2757 		struct stats *stats = syscall_stats_entry->stats;
2758 		if (stats) {
2759 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2760 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2761 			double avg = avg_stats(stats);
2762 			double pct;
2763 			u64 n = (u64) stats->n;
2764 
2765 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2766 			avg /= NSEC_PER_MSEC;
2767 
2768 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2769 			printed += fprintf(fp, "   %-15s", sc->name);
2770 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2771 					   n, syscall_stats_entry->msecs, min, avg);
2772 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2773 		}
2774 	}
2775 
2776 	resort_rb__delete(syscall_stats);
2777 	printed += fprintf(fp, "\n\n");
2778 
2779 	return printed;
2780 }
2781 
2782 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2783 {
2784 	size_t printed = 0;
2785 	struct thread_trace *ttrace = thread__priv(thread);
2786 	double ratio;
2787 
2788 	if (ttrace == NULL)
2789 		return 0;
2790 
2791 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2792 
2793 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2794 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2795 	printed += fprintf(fp, "%.1f%%", ratio);
2796 	if (ttrace->pfmaj)
2797 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2798 	if (ttrace->pfmin)
2799 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2800 	if (trace->sched)
2801 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2802 	else if (fputc('\n', fp) != EOF)
2803 		++printed;
2804 
2805 	printed += thread__dump_stats(ttrace, trace, fp);
2806 
2807 	return printed;
2808 }
2809 
2810 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2811 {
2812 	return ttrace ? ttrace->nr_events : 0;
2813 }
2814 
2815 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2816 	struct thread *thread;
2817 )
2818 {
2819 	entry->thread = rb_entry(nd, struct thread, rb_node);
2820 }
2821 
2822 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2823 {
2824 	size_t printed = trace__fprintf_threads_header(fp);
2825 	struct rb_node *nd;
2826 	int i;
2827 
2828 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2829 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2830 
2831 		if (threads == NULL) {
2832 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2833 			return 0;
2834 		}
2835 
2836 		resort_rb__for_each_entry(nd, threads)
2837 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2838 
2839 		resort_rb__delete(threads);
2840 	}
2841 	return printed;
2842 }
2843 
2844 static int trace__set_duration(const struct option *opt, const char *str,
2845 			       int unset __maybe_unused)
2846 {
2847 	struct trace *trace = opt->value;
2848 
2849 	trace->duration_filter = atof(str);
2850 	return 0;
2851 }
2852 
2853 static int trace__set_filter_pids(const struct option *opt, const char *str,
2854 				  int unset __maybe_unused)
2855 {
2856 	int ret = -1;
2857 	size_t i;
2858 	struct trace *trace = opt->value;
2859 	/*
2860 	 * FIXME: introduce a intarray class, plain parse csv and create a
2861 	 * { int nr, int entries[] } struct...
2862 	 */
2863 	struct intlist *list = intlist__new(str);
2864 
2865 	if (list == NULL)
2866 		return -1;
2867 
2868 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2869 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2870 
2871 	if (trace->filter_pids.entries == NULL)
2872 		goto out;
2873 
2874 	trace->filter_pids.entries[0] = getpid();
2875 
2876 	for (i = 1; i < trace->filter_pids.nr; ++i)
2877 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2878 
2879 	intlist__delete(list);
2880 	ret = 0;
2881 out:
2882 	return ret;
2883 }
2884 
2885 static int trace__open_output(struct trace *trace, const char *filename)
2886 {
2887 	struct stat st;
2888 
2889 	if (!stat(filename, &st) && st.st_size) {
2890 		char oldname[PATH_MAX];
2891 
2892 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2893 		unlink(oldname);
2894 		rename(filename, oldname);
2895 	}
2896 
2897 	trace->output = fopen(filename, "w");
2898 
2899 	return trace->output == NULL ? -errno : 0;
2900 }
2901 
2902 static int parse_pagefaults(const struct option *opt, const char *str,
2903 			    int unset __maybe_unused)
2904 {
2905 	int *trace_pgfaults = opt->value;
2906 
2907 	if (strcmp(str, "all") == 0)
2908 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2909 	else if (strcmp(str, "maj") == 0)
2910 		*trace_pgfaults |= TRACE_PFMAJ;
2911 	else if (strcmp(str, "min") == 0)
2912 		*trace_pgfaults |= TRACE_PFMIN;
2913 	else
2914 		return -1;
2915 
2916 	return 0;
2917 }
2918 
2919 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2920 {
2921 	struct perf_evsel *evsel;
2922 
2923 	evlist__for_each_entry(evlist, evsel)
2924 		evsel->handler = handler;
2925 }
2926 
2927 /*
2928  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2929  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2930  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2931  *
2932  * It'd be better to introduce a parse_options() variant that would return a
2933  * list with the terms it didn't match to an event...
2934  */
2935 static int trace__parse_events_option(const struct option *opt, const char *str,
2936 				      int unset __maybe_unused)
2937 {
2938 	struct trace *trace = (struct trace *)opt->value;
2939 	const char *s = str;
2940 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2941 	int len = strlen(str) + 1, err = -1, list, idx;
2942 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2943 	char group_name[PATH_MAX];
2944 
2945 	if (strace_groups_dir == NULL)
2946 		return -1;
2947 
2948 	if (*s == '!') {
2949 		++s;
2950 		trace->not_ev_qualifier = true;
2951 	}
2952 
2953 	while (1) {
2954 		if ((sep = strchr(s, ',')) != NULL)
2955 			*sep = '\0';
2956 
2957 		list = 0;
2958 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2959 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2960 			list = 1;
2961 		} else {
2962 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2963 			if (access(group_name, R_OK) == 0)
2964 				list = 1;
2965 		}
2966 
2967 		if (lists[list]) {
2968 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2969 		} else {
2970 			lists[list] = malloc(len);
2971 			if (lists[list] == NULL)
2972 				goto out;
2973 			strcpy(lists[list], s);
2974 		}
2975 
2976 		if (!sep)
2977 			break;
2978 
2979 		*sep = ',';
2980 		s = sep + 1;
2981 	}
2982 
2983 	if (lists[1] != NULL) {
2984 		struct strlist_config slist_config = {
2985 			.dirname = strace_groups_dir,
2986 		};
2987 
2988 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2989 		if (trace->ev_qualifier == NULL) {
2990 			fputs("Not enough memory to parse event qualifier", trace->output);
2991 			goto out;
2992 		}
2993 
2994 		if (trace__validate_ev_qualifier(trace))
2995 			goto out;
2996 	}
2997 
2998 	err = 0;
2999 
3000 	if (lists[0]) {
3001 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
3002 					       "event selector. use 'perf list' to list available events",
3003 					       parse_events_option);
3004 		err = parse_events_option(&o, lists[0], 0);
3005 	}
3006 out:
3007 	if (sep)
3008 		*sep = ',';
3009 
3010 	return err;
3011 }
3012 
3013 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3014 {
3015 	struct trace *trace = opt->value;
3016 
3017 	if (!list_empty(&trace->evlist->entries))
3018 		return parse_cgroups(opt, str, unset);
3019 
3020 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3021 
3022 	return 0;
3023 }
3024 
/*
 * Entry point of the 'trace' builtin.
 *
 * Sets up a struct trace with its defaults, parses the command line, applies
 * the option dependent tweaks (pagefaults, callchains, stack depth) and then
 * dispatches to one of:
 *
 *   trace__record()  when the first non-option argument is "record",
 *   trace__replay()  when input_name is set (-i),
 *   trace__run()     otherwise.
 *
 * Returns whatever the dispatched function returned, the error of the setup
 * step that failed, or -1 for the usage errors detected here.
 */
3025 int cmd_trace(int argc, const char **argv)
3026 {
3027 	const char *trace_usage[] = {
3028 		"perf trace [<options>] [<command>]",
3029 		"perf trace [<options>] -- <command> [<options>]",
3030 		"perf trace record [<options>] [<command>]",
3031 		"perf trace record [<options>] -- <command> [<options>]",
3032 		NULL
3033 	};
3034 	struct trace trace = {
3035 		.syscalls = {
3036 			. max = -1,
3037 		},
3038 		.opts = {
3039 			.target = {
3040 				.uid	   = UINT_MAX,
3041 				.uses_mmap = true,
3042 			},
3043 			.user_freq     = UINT_MAX,
3044 			.user_interval = ULLONG_MAX,
3045 			.no_buffering  = true,
			/* UINT_MAX doubles as "not set by the user", checked after parsing */
3046 			.mmap_pages    = UINT_MAX,
3047 			.proc_map_timeout  = 500,
3048 		},
3049 		.output = stderr,
3050 		.show_comm = true,
3051 		.trace_syscalls = true,
3052 		.kernel_syscallchains = false,
		/* UINT_MAX doubles as "not set by the user", checked after parsing */
3053 		.max_stack = UINT_MAX,
3054 	};
3055 	const char *output_name = NULL;
3056 	const struct option trace_options[] = {
	/* -e/--event and --expr share the same callback, which receives &trace */
3057 	OPT_CALLBACK('e', "event", &trace, "event",
3058 		     "event/syscall selector. use 'perf list' to list available events",
3059 		     trace__parse_events_option),
3060 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3061 		    "show the thread COMM next to its id"),
3062 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3063 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3064 		     trace__parse_events_option),
3065 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3066 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3067 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3068 		    "trace events on existing process id"),
3069 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3070 		    "trace events on existing thread id"),
3071 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3072 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3073 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3074 		    "system-wide collection from all CPUs"),
3075 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3076 		    "list of cpus to monitor"),
3077 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3078 		    "child tasks do not inherit counters"),
3079 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3080 		     "number of mmap data pages",
3081 		     perf_evlist__parse_mmap_pages),
3082 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3083 		   "user to profile"),
3084 	OPT_CALLBACK(0, "duration", &trace, "float",
3085 		     "show only events with duration > N.M ms",
3086 		     trace__set_duration),
3087 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3088 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3089 	OPT_BOOLEAN('T', "time", &trace.full_time,
3090 		    "Show full timestamp, not time relative to first start"),
3091 	OPT_BOOLEAN(0, "failure", &trace.failure_only,
3092 		    "Show only syscalls that failed"),
3093 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3094 		    "Show only syscall summary with statistics"),
3095 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3096 		    "Show all syscalls and summary with statistics"),
3097 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3098 		     "Trace pagefaults", parse_pagefaults, "maj"),
3099 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3100 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3101 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3102 		     "record_mode[,record_size]", record_callchain_help,
3103 		     &record_parse_callchain_opt),
3104 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3105 		    "Show the kernel callchains on the syscall exit path"),
3106 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3107 		     "Set the minimum stack depth when parsing the callchain, "
3108 		     "anything below the specified depth will be ignored."),
3109 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3110 		     "Set the maximum stack depth when parsing the callchain, "
3111 		     "anything beyond the specified depth will be ignored. "
3112 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3113 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3114 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3115 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3116 			"per thread proc mmap processing timeout in ms"),
3117 	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
3118 		     trace__parse_cgroups),
3119 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3120 		     "ms to wait before starting measurement after program "
3121 		     "start"),
3122 	OPT_END()
3123 	};
	/*
	 * Both flags start out true and are cleared below when the matching
	 * field still holds its UINT_MAX sentinel.  max_stack_user_set is only
	 * read under HAVE_DWARF_UNWIND_SUPPORT, hence __maybe_unused.
	 */
3124 	bool __maybe_unused max_stack_user_set = true;
3125 	bool mmap_pages_user_set = true;
3126 	const char * const trace_subcommands[] = { "record", NULL };
3127 	int err;
3128 	char bf[BUFSIZ];
3129 
3130 	signal(SIGSEGV, sighandler_dump_stack);
3131 	signal(SIGFPE, sighandler_dump_stack);
3132 
	/* The option callbacks use trace.evlist and trace.sctbl, so create them before parsing */
3133 	trace.evlist = perf_evlist__new();
3134 	trace.sctbl = syscalltbl__new();
3135 
3136 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3137 		pr_err("Not enough memory to run!\n");
3138 		err = -ENOMEM;
3139 		goto out;
3140 	}
3141 
3142 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3143 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3144 
3145 	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
3146 		usage_with_options_msg(trace_usage, trace_options,
3147 				       "cgroup monitoring only available in system-wide mode");
3148 	}
3149 
3150 	err = bpf__setup_stdout(trace.evlist);
3151 	if (err) {
3152 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3153 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3154 		goto out;
3155 	}
3156 
	/* Result for the "goto out" usage-error path below that does not set err itself */
3157 	err = -1;
3158 
3159 	if (trace.trace_pgfaults) {
3160 		trace.opts.sample_address = true;
3161 		trace.opts.sample_time = true;
3162 	}
3163 
3164 	if (trace.opts.mmap_pages == UINT_MAX)
3165 		mmap_pages_user_set = false;
3166 
3167 	if (trace.max_stack == UINT_MAX) {
3168 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3169 		max_stack_user_set = false;
3170 	}
3171 
3172 #ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Asking for stack depth limits without --call-graph selects "dwarf" callchains */
3173 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3174 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3175 	}
3176 #endif
3177 
3178 	if (callchain_param.enabled) {
		/* With callchains, an effective uid of 0 and no -m given: use 4x perf_event_mlock_kb_in_pages() */
3179 		if (!mmap_pages_user_set && geteuid() == 0)
3180 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3181 
3182 		symbol_conf.use_callchain = true;
3183 	}
3184 
3185 	if (trace.evlist->nr_entries > 0)
3186 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3187 
	/* "record" subcommand: hand over the remaining arguments, skipping "record" itself */
3188 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3189 		return trace__record(&trace, argc-1, &argv[1]);
3190 
3191 	/* summary_only implies summary option, but don't overwrite summary if set */
3192 	if (trace.summary_only)
3193 		trace.summary = trace.summary_only;
3194 
3195 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3196 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3197 		pr_err("Please specify something to trace.\n");
3198 		return -1;
3199 	}
3200 
3201 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3202 		pr_err("The -e option can't be used with --no-syscalls.\n");
3203 		goto out;
3204 	}
3205 
3206 	if (output_name != NULL) {
3207 		err = trace__open_output(&trace, output_name);
3208 		if (err < 0) {
3209 			perror("failed to create output file");
3210 			goto out;
3211 		}
3212 	}
3213 
3214 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3215 
3216 	err = target__validate(&trace.opts.target);
3217 	if (err) {
3218 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3219 		fprintf(trace.output, "%s", bf);
3220 		goto out_close;
3221 	}
3222 
3223 	err = target__parse_uid(&trace.opts.target);
3224 	if (err) {
3225 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3226 		fprintf(trace.output, "%s", bf);
3227 		goto out_close;
3228 	}
3229 
	/* No workload to run and no target specified: trace system wide */
3230 	if (!argc && target__none(&trace.opts.target))
3231 		trace.opts.target.system_wide = true;
3232 
3233 	if (input_name)
3234 		err = trace__replay(&trace);
3235 	else
3236 		err = trace__run(&trace, argc, argv);
3237 
3238 out_close:
	/* Only close what trace__open_output() opened; otherwise output is still the stderr default */
3239 	if (output_name != NULL)
3240 		fclose(trace.output);
3241 out:
3242 	return err;
3243 }
3244