xref: /linux/tools/perf/builtin-trace.c (revision 591421e151ddf95e43d690a5c9b291d8e1cb8065)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace-like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/env.h"
25 #include "util/event.h"
26 #include "util/evlist.h"
27 #include <subcmd/exec-cmd.h>
28 #include "util/machine.h"
29 #include "util/path.h"
30 #include "util/session.h"
31 #include "util/thread.h"
32 #include <subcmd/parse-options.h>
33 #include "util/strlist.h"
34 #include "util/intlist.h"
35 #include "util/thread_map.h"
36 #include "util/stat.h"
37 #include "trace/beauty/beauty.h"
38 #include "trace-event.h"
39 #include "util/parse-events.h"
40 #include "util/bpf-loader.h"
41 #include "callchain.h"
42 #include "print_binary.h"
43 #include "string2.h"
44 #include "syscalltbl.h"
45 #include "rb_resort.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 
60 #include "sane_ctype.h"
61 
62 #ifndef O_CLOEXEC
63 # define O_CLOEXEC		02000000
64 #endif
65 
66 #ifndef F_LINUX_SPECIFIC_BASE
67 # define F_LINUX_SPECIFIC_BASE	1024
68 #endif
69 
70 struct trace {
71 	struct perf_tool	tool;
72 	struct syscalltbl	*sctbl;
73 	struct {
74 		int		max;
75 		struct syscall  *table;
76 		struct {
77 			struct perf_evsel *sys_enter,
78 					  *sys_exit;
79 		}		events;
80 	} syscalls;
81 	struct record_opts	opts;
82 	struct perf_evlist	*evlist;
83 	struct machine		*host;
84 	struct thread		*current;
85 	u64			base_time;
86 	FILE			*output;
87 	unsigned long		nr_events;
88 	struct strlist		*ev_qualifier;
89 	struct {
90 		size_t		nr;
91 		int		*entries;
92 	}			ev_qualifier_ids;
93 	struct {
94 		size_t		nr;
95 		pid_t		*entries;
96 	}			filter_pids;
97 	double			duration_filter;
98 	double			runtime_ms;
99 	struct {
100 		u64		vfs_getname,
101 				proc_getname;
102 	} stats;
103 	unsigned int		max_stack;
104 	unsigned int		min_stack;
105 	bool			not_ev_qualifier;
106 	bool			live;
107 	bool			full_time;
108 	bool			sched;
109 	bool			multiple_threads;
110 	bool			summary;
111 	bool			summary_only;
112 	bool			show_comm;
113 	bool			print_sample;
114 	bool			show_tool_stats;
115 	bool			trace_syscalls;
116 	bool			kernel_syscallchains;
117 	bool			force;
118 	bool			vfs_getname;
119 	int			trace_pgfaults;
120 	int			open_id;
121 };
122 
123 struct tp_field {
124 	int offset;
125 	union {
126 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
127 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
128 	};
129 };
130 
131 #define TP_UINT_FIELD(bits) \
132 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
133 { \
134 	u##bits value; \
135 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
136 	return value;  \
137 }
138 
139 TP_UINT_FIELD(8);
140 TP_UINT_FIELD(16);
141 TP_UINT_FIELD(32);
142 TP_UINT_FIELD(64);
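
/*
 * Illustrative expansion, added for exposition: TP_UINT_FIELD(32) above
 * generates roughly the following accessor; memcpy() is used instead of a
 * direct load because raw_data offsets may not be naturally aligned:
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 */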
143 
144 #define TP_UINT_FIELD__SWAPPED(bits) \
145 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
146 { \
147 	u##bits value; \
148 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
149 	return bswap_##bits(value);\
150 }
151 
152 TP_UINT_FIELD__SWAPPED(16);
153 TP_UINT_FIELD__SWAPPED(32);
154 TP_UINT_FIELD__SWAPPED(64);
155 
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184 	return sample->raw_data + field->offset;
185 }
186 
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
194 struct syscall_tp {
195 	struct tp_field id;
196 	union {
197 		struct tp_field args, ret;
198 	};
199 };
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
213 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
214 	({ struct syscall_tp *sc = evsel->priv;\
215 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
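
/*
 * Illustrative expansion, added for exposition:
 * perf_evsel__init_sc_tp_uint_field(evsel, id) becomes:
 *
 *	({ struct syscall_tp *sc = evsel->priv;
 *	   perf_evsel__init_tp_uint_field(evsel, &sc->id, "id"); })
 *
 * i.e. the same token both selects the struct syscall_tp member and,
 * via #name, names the tracepoint format field to look up.
 */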
216 
217 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
218 					 struct tp_field *field,
219 					 const char *name)
220 {
221 	struct format_field *format_field = perf_evsel__field(evsel, name);
222 
223 	if (format_field == NULL)
224 		return -1;
225 
226 	return tp_field__init_ptr(field, format_field);
227 }
228 
229 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
230 	({ struct syscall_tp *sc = evsel->priv;\
231 	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
233 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
234 {
235 	zfree(&evsel->priv);
236 	perf_evsel__delete(evsel);
237 }
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
278 #define perf_evsel__sc_tp_uint(evsel, name, sample) \
279 	({ struct syscall_tp *fields = evsel->priv; \
280 	   fields->name.integer(&fields->name, sample); })
281 
282 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \
283 	({ struct syscall_tp *fields = evsel->priv; \
284 	   fields->name.pointer(&fields->name, sample); })
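
/*
 * Illustrative expansion, added for exposition, of the use in
 * trace__sys_enter() below:
 *
 *	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
 *
 * becomes:
 *
 *	args = ({ struct syscall_tp *fields = evsel->priv;
 *		  fields->args.pointer(&fields->args, sample); });
 */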
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
296 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
297 						const char *intfmt,
298 					        struct syscall_arg *arg)
299 {
300 	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
301 }
302 
303 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
304 					      struct syscall_arg *arg)
305 {
306 	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
307 }
308 
309 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
311 struct strarrays {
312 	int		nr_entries;
313 	struct strarray **entries;
314 };
315 
316 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
317 	.nr_entries = ARRAY_SIZE(array), \
318 	.entries = array, \
319 }
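
/*
 * Illustrative expansion, added for exposition:
 * DEFINE_STRARRAYS(fcntl_cmds_arrays), used further down, becomes:
 *
 *	struct strarrays strarrays__fcntl_cmds_arrays = {
 *		.nr_entries = ARRAY_SIZE(fcntl_cmds_arrays),
 *		.entries = fcntl_cmds_arrays,
 *	};
 */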
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
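
/*
 * Worked example, added for exposition: with strarrays__fcntl_cmds_arrays
 * (defined below), whose second array starts at F_LINUX_SPECIFIC_BASE (1024):
 *
 *	val 1    -> "GETFD"    (hit in strarray__fcntl_cmds)
 *	val 1024 -> "SETLEASE" (hit via the second array's offset)
 *	val 2048 -> "2048"     (no hit in either array, integer fallback)
 */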
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	(-100)
343 #endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359 					      struct syscall_arg *arg);
360 
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
363 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
364 {
365 	return scnprintf(bf, size, "%#lx", arg->val);
366 }
367 
368 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
369 {
370 	return scnprintf(bf, size, "%d", arg->val);
371 }
372 
373 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
374 {
375 	return scnprintf(bf, size, "%ld", arg->val);
376 }
377 
378 static const char *bpf_cmd[] = {
379 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
380 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
381 };
382 static DEFINE_STRARRAY(bpf_cmd);
383 
384 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
385 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
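/*
 * Example, added for exposition: with the offset of 1 above,
 * strarray__scnprintf() maps epoll_ctl's EPOLL_CTL_ADD(1)/DEL(2)/MOD(3)
 * op values to "ADD"/"DEL"/"MOD"; anything else falls back to the
 * integer format, e.g. 0 prints as "0".
 */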
386 
387 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
388 static DEFINE_STRARRAY(itimers);
389 
390 static const char *keyctl_options[] = {
391 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
392 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
393 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
394 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
395 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
396 };
397 static DEFINE_STRARRAY(keyctl_options);
398 
399 static const char *whences[] = { "SET", "CUR", "END",
400 #ifdef SEEK_DATA
401 "DATA",
402 #endif
403 #ifdef SEEK_HOLE
404 "HOLE",
405 #endif
406 };
407 static DEFINE_STRARRAY(whences);
408 
409 static const char *fcntl_cmds[] = {
410 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
411 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
412 	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
413 	"GETOWNER_UIDS",
414 };
415 static DEFINE_STRARRAY(fcntl_cmds);
416 
417 static const char *fcntl_linux_specific_cmds[] = {
418 	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
419 	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
420 	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
421 };
422 
423 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);
424 
425 static struct strarray *fcntl_cmds_arrays[] = {
426 	&strarray__fcntl_cmds,
427 	&strarray__fcntl_linux_specific_cmds,
428 };
429 
430 static DEFINE_STRARRAYS(fcntl_cmds_arrays);
431 
432 static const char *rlimit_resources[] = {
433 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
434 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
435 	"RTTIME",
436 };
437 static DEFINE_STRARRAY(rlimit_resources);
438 
439 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
440 static DEFINE_STRARRAY(sighow);
441 
442 static const char *clockid[] = {
443 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
444 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
445 	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
446 };
447 static DEFINE_STRARRAY(clockid);
448 
449 static const char *socket_families[] = {
450 	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
451 	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
452 	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
453 	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
454 	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
455 	"ALG", "NFC", "VSOCK",
456 };
457 static DEFINE_STRARRAY(socket_families);
458 
459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460 						 struct syscall_arg *arg)
461 {
462 	size_t printed = 0;
463 	int mode = arg->val;
464 
465 	if (mode == F_OK) /* 0 */
466 		return scnprintf(bf, size, "F");
467 #define	P_MODE(n) \
468 	if (mode & n##_OK) { \
469 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
470 		mode &= ~n##_OK; \
471 	}
472 
473 	P_MODE(R);
474 	P_MODE(W);
475 	P_MODE(X);
476 #undef P_MODE
477 
478 	if (mode)
479 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
480 
481 	return printed;
482 }
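
/*
 * Example, added for exposition: for access(path, R_OK|W_OK) the mode
 * argument prints as "RW" (no separator between the letters), mode 0
 * prints as "F", and leftover unknown bits are appended in hex, e.g.
 * "R|0x8".
 */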
483 
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487 					      struct syscall_arg *arg);
488 
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
490 
491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492 						struct syscall_arg *arg)
493 {
494 	int printed = 0, flags = arg->val;
495 
496 #define	P_FLAG(n) \
497 	if (flags & O_##n) { \
498 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
499 		flags &= ~O_##n; \
500 	}
501 
502 	P_FLAG(CLOEXEC);
503 	P_FLAG(NONBLOCK);
504 #undef P_FLAG
505 
506 	if (flags)
507 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
508 
509 	return printed;
510 }
511 
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK	0x0001
516 #endif
517 #ifndef GRND_RANDOM
518 #define GRND_RANDOM	0x0002
519 #endif
520 
521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 						   struct syscall_arg *arg)
523 {
524 	int printed = 0, flags = arg->val;
525 
526 #define	P_FLAG(n) \
527 	if (flags & GRND_##n) { \
528 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 		flags &= ~GRND_##n; \
530 	}
531 
532 	P_FLAG(RANDOM);
533 	P_FLAG(NONBLOCK);
534 #undef P_FLAG
535 
536 	if (flags)
537 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538 
539 	return printed;
540 }
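
/*
 * Example, added for exposition: GRND_RANDOM|GRND_NONBLOCK prints as
 * "RANDOM|NONBLOCK" (RANDOM is tested first above). A zero flags value
 * never reaches this printer: syscall__scnprintf_args() suppresses
 * zero-valued arguments that have no strarray attached.
 */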
541 
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
544 #define STRARRAY(name, array) \
545 	  { .scnprintf	= SCA_STRARRAY, \
546 	    .parm	= &strarray__##array, }
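
/*
 * Illustrative expansion, added for exposition: in the syscall_fmts
 * table below, [0] = STRARRAY(cmd, bpf_cmd) becomes:
 *
 *	[0] = { .scnprintf = SCA_STRARRAY, .parm = &strarray__bpf_cmd, }
 *
 * The 'name' parameter is purely documentational, the macro body never
 * uses it.
 */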
547 
548 #include "trace/beauty/arch_errno_names.c"
549 #include "trace/beauty/eventfd.c"
550 #include "trace/beauty/flock.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/mmap.c"
553 #include "trace/beauty/mode_t.c"
554 #include "trace/beauty/msg_flags.c"
555 #include "trace/beauty/open_flags.c"
556 #include "trace/beauty/perf_event_open.c"
557 #include "trace/beauty/pid.c"
558 #include "trace/beauty/sched_policy.c"
559 #include "trace/beauty/seccomp.c"
560 #include "trace/beauty/signum.c"
561 #include "trace/beauty/socket_type.c"
562 #include "trace/beauty/waitid_options.c"
563 
564 struct syscall_arg_fmt {
565 	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
566 	void	   *parm;
567 	const char *name;
568 	bool	   show_zero;
569 };
570 
571 static struct syscall_fmt {
572 	const char *name;
573 	const char *alias;
574 	struct syscall_arg_fmt arg[6];
575 	u8	   nr_args;
576 	bool	   errpid;
577 	bool	   timeout;
578 	bool	   hexret;
579 } syscall_fmts[] = {
580 	{ .name	    = "access",
581 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
582 	{ .name	    = "bpf",
583 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584 	{ .name	    = "brk",	    .hexret = true,
585 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586 	{ .name     = "clock_gettime",
587 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
589 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
590 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
591 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
593 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
594 	{ .name	    = "close",
595 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596 	{ .name	    = "epoll_ctl",
597 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598 	{ .name	    = "eventfd2",
599 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600 	{ .name	    = "fchmodat",
601 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602 	{ .name	    = "fchownat",
603 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
604 	{ .name	    = "fcntl",
605 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606 			   .parm      = &strarrays__fcntl_cmds_arrays,
607 			   .show_zero = true, },
608 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
609 	{ .name	    = "flock",
610 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611 	{ .name	    = "fstat", .alias = "newfstat", },
612 	{ .name	    = "fstatat", .alias = "newfstatat", },
613 	{ .name	    = "futex",
614 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615 	{ .name	    = "futimesat",
616 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617 	{ .name	    = "getitimer",
618 	  .arg = { [0] = STRARRAY(which, itimers), }, },
619 	{ .name	    = "getpgid",    .errpid = true, },
620 	{ .name	    = "getpid",	    .errpid = true, },
621 	{ .name	    = "getppid",    .errpid = true, },
622 	{ .name	    = "getrandom",
623 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624 	{ .name	    = "getrlimit",
625 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
626 	{ .name	    = "gettid",	    .errpid = true, },
627 	{ .name	    = "ioctl",
628 	  .arg = {
629 #if defined(__i386__) || defined(__x86_64__)
630 /*
631  * FIXME: Make this available to all arches.
632  */
633 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
634 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
635 #else
636 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
637 #endif
638 	{ .name	    = "kcmp",	    .nr_args = 5,
639 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
640 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
641 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
642 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
643 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
644 	{ .name	    = "keyctl",
645 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
646 	{ .name	    = "kill",
647 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
648 	{ .name	    = "linkat",
649 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
650 	{ .name	    = "lseek",
651 	  .arg = { [2] = STRARRAY(whence, whences), }, },
652 	{ .name	    = "lstat", .alias = "newlstat", },
653 	{ .name     = "madvise",
654 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
655 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
656 	{ .name	    = "mkdirat",
657 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
658 	{ .name	    = "mknodat",
659 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
660 	{ .name	    = "mlock",
661 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
662 	{ .name	    = "mlockall",
663 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
664 	{ .name	    = "mmap",	    .hexret = true,
665 /* The standard mmap maps to old_mmap on s390x */
666 #if defined(__s390x__)
667 	.alias = "old_mmap",
668 #endif
669 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
670 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
671 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
672 	{ .name	    = "mprotect",
673 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
674 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
675 	{ .name	    = "mq_unlink",
676 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
677 	{ .name	    = "mremap",	    .hexret = true,
678 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
679 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
680 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
681 	{ .name	    = "munlock",
682 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
683 	{ .name	    = "munmap",
684 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
685 	{ .name	    = "name_to_handle_at",
686 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
687 	{ .name	    = "newfstatat",
688 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
689 	{ .name	    = "open",
690 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
691 	{ .name	    = "open_by_handle_at",
692 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
693 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
694 	{ .name	    = "openat",
695 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
696 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
697 	{ .name	    = "perf_event_open",
698 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
699 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
700 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
701 	{ .name	    = "pipe2",
702 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
703 	{ .name	    = "pkey_alloc",
704 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
705 	{ .name	    = "pkey_free",
706 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
707 	{ .name	    = "pkey_mprotect",
708 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
709 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
710 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
711 	{ .name	    = "poll", .timeout = true, },
712 	{ .name	    = "ppoll", .timeout = true, },
713 	{ .name	    = "prctl", .alias = "arch_prctl",
714 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
715 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
716 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
717 	{ .name	    = "pread", .alias = "pread64", },
718 	{ .name	    = "preadv", .alias = "pread", },
719 	{ .name	    = "prlimit64",
720 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
721 	{ .name	    = "pwrite", .alias = "pwrite64", },
722 	{ .name	    = "readlinkat",
723 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
724 	{ .name	    = "recvfrom",
725 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
726 	{ .name	    = "recvmmsg",
727 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
728 	{ .name	    = "recvmsg",
729 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 	{ .name	    = "renameat",
731 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
732 	{ .name	    = "rt_sigaction",
733 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
734 	{ .name	    = "rt_sigprocmask",
735 	  .arg = { [0] = STRARRAY(how, sighow), }, },
736 	{ .name	    = "rt_sigqueueinfo",
737 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
738 	{ .name	    = "rt_tgsigqueueinfo",
739 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
740 	{ .name	    = "sched_setscheduler",
741 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
742 	{ .name	    = "seccomp",
743 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
744 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
745 	{ .name	    = "select", .timeout = true, },
746 	{ .name	    = "sendmmsg",
747 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
748 	{ .name	    = "sendmsg",
749 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "sendto",
751 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752 	{ .name	    = "set_tid_address", .errpid = true, },
753 	{ .name	    = "setitimer",
754 	  .arg = { [0] = STRARRAY(which, itimers), }, },
755 	{ .name	    = "setrlimit",
756 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
757 	{ .name	    = "socket",
758 	  .arg = { [0] = STRARRAY(family, socket_families),
759 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
760 	{ .name	    = "socketpair",
761 	  .arg = { [0] = STRARRAY(family, socket_families),
762 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
763 	{ .name	    = "stat", .alias = "newstat", },
764 	{ .name	    = "statx",
765 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
766 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
767 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
768 	{ .name	    = "swapoff",
769 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
770 	{ .name	    = "swapon",
771 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
772 	{ .name	    = "symlinkat",
773 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
774 	{ .name	    = "tgkill",
775 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
776 	{ .name	    = "tkill",
777 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
778 	{ .name	    = "uname", .alias = "newuname", },
779 	{ .name	    = "unlinkat",
780 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
781 	{ .name	    = "utimensat",
782 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
783 	{ .name	    = "wait4",	    .errpid = true,
784 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
785 	{ .name	    = "waitid",	    .errpid = true,
786 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
787 };
788 
789 static int syscall_fmt__cmp(const void *name, const void *fmtp)
790 {
791 	const struct syscall_fmt *fmt = fmtp;
792 	return strcmp(name, fmt->name);
793 }
794 
795 static struct syscall_fmt *syscall_fmt__find(const char *name)
796 {
797 	const int nmemb = ARRAY_SIZE(syscall_fmts);
798 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
799 }
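
/*
 * Note, added for exposition: bsearch() requires syscall_fmts[] to stay
 * sorted by strcmp() order on .name, an out-of-order entry is silently
 * unfindable. E.g. syscall_fmt__find("open") returns the entry that
 * formats arg[1] with SCA_OPEN_FLAGS.
 */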
800 
801 struct syscall {
802 	struct event_format *tp_format;
803 	int		    nr_args;
804 	struct format_field *args;
805 	const char	    *name;
806 	bool		    is_exit;
807 	struct syscall_fmt  *fmt;
808 	struct syscall_arg_fmt *arg_fmt;
809 };
810 
811 /*
812  * We need this 'calculated' boolean because in some cases we really
813  * don't know the duration of a syscall: for instance, when we start
814  * a session and some threads are already waiting for a syscall, say 'poll',
815  * to finish, in which case all we can do is print "( ? )" for the
816  * duration and the start timestamp.
817  */
818 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
819 {
820 	double duration = (double)t / NSEC_PER_MSEC;
821 	size_t printed = fprintf(fp, "(");
822 
823 	if (!calculated)
824 		printed += fprintf(fp, "     ?   ");
825 	else if (duration >= 1.0)
826 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
827 	else if (duration >= 0.01)
828 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
829 	else
830 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
831 	return printed + fprintf(fp, "): ");
832 }
833 
834 /**
835  * filename.ptr: The filename char pointer that will be vfs_getname'd
836  * filename.entry_str_pos: Where to insert the string translated from
837  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
838  * ret_scnprintf: syscall args may set this to a different syscall return
839  *                formatter, for instance, fcntl may return fds, file flags, etc.
840  */
841 struct thread_trace {
842 	u64		  entry_time;
843 	bool		  entry_pending;
844 	unsigned long	  nr_events;
845 	unsigned long	  pfmaj, pfmin;
846 	char		  *entry_str;
847 	double		  runtime_ms;
848 	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
849 	struct {
850 		unsigned long ptr;
851 		short int     entry_str_pos;
852 		bool	      pending_open;
853 		unsigned int  namelen;
854 		char	      *name;
855 	} filename;
856 	struct {
857 		int	  max;
858 		char	  **table;
859 	} paths;
860 
861 	struct intlist *syscall_stats;
862 };
863 
864 static struct thread_trace *thread_trace__new(void)
865 {
866 	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
867 
868 	if (ttrace) {
869 		ttrace->paths.max = -1;
870 		ttrace->syscall_stats = intlist__new(NULL);
871 	}
872 
873 	return ttrace;
874 }
875 
876 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
877 {
878 	struct thread_trace *ttrace;
879 
880 	if (thread == NULL)
881 		goto fail;
882 
883 	if (thread__priv(thread) == NULL)
884 		thread__set_priv(thread, thread_trace__new());
885 
886 	if (thread__priv(thread) == NULL)
887 		goto fail;
888 
889 	ttrace = thread__priv(thread);
890 	++ttrace->nr_events;
891 
892 	return ttrace;
893 fail:
894 	color_fprintf(fp, PERF_COLOR_RED,
895 		      "WARNING: not enough memory, dropping samples!\n");
896 	return NULL;
897 }
898 
899 
900 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
901 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
902 {
903 	struct thread_trace *ttrace = thread__priv(arg->thread);
904 
905 	ttrace->ret_scnprintf = ret_scnprintf;
906 }
907 
908 #define TRACE_PFMAJ		(1 << 0)
909 #define TRACE_PFMIN		(1 << 1)
910 
911 static const size_t trace__entry_str_size = 2048;
912 
913 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
914 {
915 	struct thread_trace *ttrace = thread__priv(thread);
916 
917 	if (fd > ttrace->paths.max) {
918 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
919 
920 		if (npath == NULL)
921 			return -1;
922 
923 		if (ttrace->paths.max != -1) {
924 			memset(npath + ttrace->paths.max + 1, 0,
925 			       (fd - ttrace->paths.max) * sizeof(char *));
926 		} else {
927 			memset(npath, 0, (fd + 1) * sizeof(char *));
928 		}
929 
930 		ttrace->paths.table = npath;
931 		ttrace->paths.max   = fd;
932 	}
933 
934 	ttrace->paths.table[fd] = strdup(pathname);
935 
936 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
937 }
938 
939 static int thread__read_fd_path(struct thread *thread, int fd)
940 {
941 	char linkname[PATH_MAX], pathname[PATH_MAX];
942 	struct stat st;
943 	int ret;
944 
945 	if (thread->pid_ == thread->tid) {
946 		scnprintf(linkname, sizeof(linkname),
947 			  "/proc/%d/fd/%d", thread->pid_, fd);
948 	} else {
949 		scnprintf(linkname, sizeof(linkname),
950 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
951 	}
952 
953 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
954 		return -1;
955 
956 	ret = readlink(linkname, pathname, sizeof(pathname));
957 
958 	if (ret < 0 || ret > st.st_size)
959 		return -1;
960 
961 	pathname[ret] = '\0';
962 	return trace__set_fd_pathname(thread, fd, pathname);
963 }
964 
965 static const char *thread__fd_path(struct thread *thread, int fd,
966 				   struct trace *trace)
967 {
968 	struct thread_trace *ttrace = thread__priv(thread);
969 
970 	if (ttrace == NULL)
971 		return NULL;
972 
973 	if (fd < 0)
974 		return NULL;
975 
976 	if (fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL) {
977 		if (!trace->live)
978 			return NULL;
979 		++trace->stats.proc_getname;
980 		if (thread__read_fd_path(thread, fd))
981 			return NULL;
982 	}
983 
984 	return ttrace->paths.table[fd];
985 }
986 
987 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
988 {
989 	int fd = arg->val;
990 	size_t printed = scnprintf(bf, size, "%d", fd);
991 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
992 
993 	if (path)
994 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
995 
996 	return printed;
997 }
998 
999 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1000 {
1001 	size_t printed = scnprintf(bf, size, "%d", fd);
1002 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1003 
1004 	if (thread) {
1005 		const char *path = thread__fd_path(thread, fd, trace);
1006 
1007 		if (path)
1008 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1009 
1010 		thread__put(thread);
1011 	}
1012 
1013 	return printed;
1014 }
1015 
1016 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1017 					      struct syscall_arg *arg)
1018 {
1019 	int fd = arg->val;
1020 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1021 	struct thread_trace *ttrace = thread__priv(arg->thread);
1022 
1023 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1024 		zfree(&ttrace->paths.table[fd]);
1025 
1026 	return printed;
1027 }
1028 
1029 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1030 				     unsigned long ptr)
1031 {
1032 	struct thread_trace *ttrace = thread__priv(thread);
1033 
1034 	ttrace->filename.ptr = ptr;
1035 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1036 }
1037 
1038 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1039 					      struct syscall_arg *arg)
1040 {
1041 	unsigned long ptr = arg->val;
1042 
1043 	if (!arg->trace->vfs_getname)
1044 		return scnprintf(bf, size, "%#lx", ptr);
1045 
1046 	thread__set_filename_pos(arg->thread, bf, ptr);
1047 	return 0;
1048 }
1049 
1050 static bool trace__filter_duration(struct trace *trace, double t)
1051 {
1052 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1053 }
1054 
1055 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1056 {
1057 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1058 
1059 	return fprintf(fp, "%10.3f ", ts);
1060 }
1061 
1062 /*
1063  * We're handling tstamp=0 as an undefined tstamp, e.g. when we use
1064  * ttrace->entry_time for a thread that receives a sys_exit without
1065  * first having received a sys_enter ("poll" issued before the tracing
1066  * session starts, or the sys_enter lost to ring buffer overflow).
1067  */
1068 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1069 {
1070 	if (tstamp > 0)
1071 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1072 
1073 	return fprintf(fp, "         ? ");
1074 }
1075 
1076 static bool done = false;
1077 static bool interrupted = false;
1078 
1079 static void sig_handler(int sig)
1080 {
1081 	done = true;
1082 	interrupted = sig == SIGINT;
1083 }
1084 
1085 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1086 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1087 {
1088 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1089 	printed += fprintf_duration(duration, duration_calculated, fp);
1090 
1091 	if (trace->multiple_threads) {
1092 		if (trace->show_comm)
1093 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1094 		printed += fprintf(fp, "%d ", thread->tid);
1095 	}
1096 
1097 	return printed;
1098 }
1099 
1100 static int trace__process_event(struct trace *trace, struct machine *machine,
1101 				union perf_event *event, struct perf_sample *sample)
1102 {
1103 	int ret = 0;
1104 
1105 	switch (event->header.type) {
1106 	case PERF_RECORD_LOST:
1107 		color_fprintf(trace->output, PERF_COLOR_RED,
1108 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1109 		ret = machine__process_lost_event(machine, event, sample);
1110 		break;
1111 	default:
1112 		ret = machine__process_event(machine, event, sample);
1113 		break;
1114 	}
1115 
1116 	return ret;
1117 }
1118 
1119 static int trace__tool_process(struct perf_tool *tool,
1120 			       union perf_event *event,
1121 			       struct perf_sample *sample,
1122 			       struct machine *machine)
1123 {
1124 	struct trace *trace = container_of(tool, struct trace, tool);
1125 	return trace__process_event(trace, machine, event, sample);
1126 }
1127 
1128 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1129 {
1130 	struct machine *machine = vmachine;
1131 
1132 	if (machine->kptr_restrict_warned)
1133 		return NULL;
1134 
1135 	if (symbol_conf.kptr_restrict) {
1136 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1137 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1138 			   "Kernel samples will not be resolved.\n");
1139 		machine->kptr_restrict_warned = true;
1140 		return NULL;
1141 	}
1142 
1143 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1144 }
1145 
1146 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1147 {
1148 	int err = symbol__init(NULL);
1149 
1150 	if (err)
1151 		return err;
1152 
1153 	trace->host = machine__new_host();
1154 	if (trace->host == NULL)
1155 		return -ENOMEM;
1156 
1157 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1158 	if (err < 0)
1159 		goto out;
1160 
1161 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1162 					    evlist->threads, trace__tool_process, false,
1163 					    trace->opts.proc_map_timeout, 1);
1164 out:
1165 	if (err)
1166 		symbol__exit();
1167 
1168 	return err;
1169 }
1170 
1171 static void trace__symbols__exit(struct trace *trace)
1172 {
1173 	machine__exit(trace->host);
1174 	trace->host = NULL;
1175 
1176 	symbol__exit();
1177 }
1178 
1179 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1180 {
1181 	int idx;
1182 
1183 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1184 		nr_args = sc->fmt->nr_args;
1185 
1186 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1187 	if (sc->arg_fmt == NULL)
1188 		return -1;
1189 
1190 	for (idx = 0; idx < nr_args; ++idx) {
1191 		if (sc->fmt)
1192 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1193 	}
1194 
1195 	sc->nr_args = nr_args;
1196 	return 0;
1197 }
1198 
1199 static int syscall__set_arg_fmts(struct syscall *sc)
1200 {
1201 	struct format_field *field;
1202 	int idx = 0, len;
1203 
1204 	for (field = sc->args; field; field = field->next, ++idx) {
1205 		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
1206 			continue;
1207 
1208 		if (strcmp(field->type, "const char *") == 0 &&
1209 			 (strcmp(field->name, "filename") == 0 ||
1210 			  strcmp(field->name, "path") == 0 ||
1211 			  strcmp(field->name, "pathname") == 0))
1212 			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
1213 		else if (field->flags & FIELD_IS_POINTER)
1214 			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
1215 		else if (strcmp(field->type, "pid_t") == 0)
1216 			sc->arg_fmt[idx].scnprintf = SCA_PID;
1217 		else if (strcmp(field->type, "umode_t") == 0)
1218 			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
1219 		else if ((strcmp(field->type, "int") == 0 ||
1220 			  strcmp(field->type, "unsigned int") == 0 ||
1221 			  strcmp(field->type, "long") == 0) &&
1222 			 (len = strlen(field->name)) >= 2 &&
1223 			 strcmp(field->name + len - 2, "fd") == 0) {
1224 			/*
1225 			 * /sys/kernel/tracing/events/syscalls/sys_enter*
1226 			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
1227 			 * 65 int
1228 			 * 23 unsigned int
1229 			 * 7 unsigned long
1230 			 */
1231 			sc->arg_fmt[idx].scnprintf = SCA_FD;
1232 		}
1233 	}
1234 
1235 	return 0;
1236 }
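
/*
 * Example of the heuristics above, added for exposition: a tracepoint
 * field "const char *filename" gets SCA_FILENAME, "pid_t pid" gets
 * SCA_PID, and an int-ish field whose name ends in "fd", like
 * "unsigned int fd", gets SCA_FD, unless the syscall_fmts[] entry
 * already set a formatter for that position.
 */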
1237 
1238 static int trace__read_syscall_info(struct trace *trace, int id)
1239 {
1240 	char tp_name[128];
1241 	struct syscall *sc;
1242 	const char *name = syscalltbl__name(trace->sctbl, id);
1243 
1244 	if (name == NULL)
1245 		return -1;
1246 
1247 	if (id > trace->syscalls.max) {
1248 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1249 
1250 		if (nsyscalls == NULL)
1251 			return -1;
1252 
1253 		if (trace->syscalls.max != -1) {
1254 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1255 			       (id - trace->syscalls.max) * sizeof(*sc));
1256 		} else {
1257 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1258 		}
1259 
1260 		trace->syscalls.table = nsyscalls;
1261 		trace->syscalls.max   = id;
1262 	}
1263 
1264 	sc = trace->syscalls.table + id;
1265 	sc->name = name;
1266 
1267 	sc->fmt  = syscall_fmt__find(sc->name);
1268 
1269 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1270 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1271 
1272 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1273 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1274 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1275 	}
1276 
1277 	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
1278 		return -1;
1279 
1280 	if (IS_ERR(sc->tp_format))
1281 		return -1;
1282 
1283 	sc->args = sc->tp_format->format.fields;
1284 	/*
1285 	 * We need to check for and discard the first field, '__syscall_nr'
1286 	 * (or 'nr' on older kernels), which holds the syscall number and
1287 	 * is not needed here.
1288 	 */
1289 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1290 		sc->args = sc->args->next;
1291 		--sc->nr_args;
1292 	}
1293 
1294 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1295 
1296 	return syscall__set_arg_fmts(sc);
1297 }
1298 
1299 static int trace__validate_ev_qualifier(struct trace *trace)
1300 {
1301 	int err = 0, i;
1302 	size_t nr_allocated;
1303 	struct str_node *pos;
1304 
1305 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1306 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1307 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1308 
1309 	if (trace->ev_qualifier_ids.entries == NULL) {
1310 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1311 		       trace->output);
1312 		err = -EINVAL;
1313 		goto out;
1314 	}
1315 
1316 	nr_allocated = trace->ev_qualifier_ids.nr;
1317 	i = 0;
1318 
1319 	strlist__for_each_entry(pos, trace->ev_qualifier) {
1320 		const char *sc = pos->s;
1321 		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;
1322 
1323 		if (id < 0) {
1324 			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
1325 			if (id >= 0)
1326 				goto matches;
1327 
1328 			if (err == 0) {
1329 				fputs("Error:\tInvalid syscall ", trace->output);
1330 				err = -EINVAL;
1331 			} else {
1332 				fputs(", ", trace->output);
1333 			}
1334 
1335 			fputs(sc, trace->output);
1336 		}
1337 matches:
1338 		trace->ev_qualifier_ids.entries[i++] = id;
1339 		if (match_next == -1)
1340 			continue;
1341 
1342 		while (1) {
1343 			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
1344 			if (id < 0)
1345 				break;
1346 			if (nr_allocated == trace->ev_qualifier_ids.nr) {
1347 				void *entries;
1348 
1349 				nr_allocated += 8;
1350 				entries = realloc(trace->ev_qualifier_ids.entries,
1351 						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
1352 				if (entries == NULL) {
1353 					err = -ENOMEM;
1354 					fputs("\nError:\tNot enough memory for parsing\n", trace->output);
1355 					goto out_free;
1356 				}
1357 				trace->ev_qualifier_ids.entries = entries;
1358 			}
1359 			trace->ev_qualifier_ids.nr++;
1360 			trace->ev_qualifier_ids.entries[i++] = id;
1361 		}
1362 	}
1363 
1364 	if (err < 0) {
1365 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1366 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1367 out_free:
1368 		zfree(&trace->ev_qualifier_ids.entries);
1369 		trace->ev_qualifier_ids.nr = 0;
1370 	}
1371 out:
1372 	return err;
1373 }
1374 
1375 /*
1376  * args is to be interpreted as a series of longs but we need to handle
1377  * 8-byte unaligned accesses. args points to raw_data within the event
1378  * and raw_data is guaranteed to be 8-byte unaligned because it is
1379  * preceded by raw_size which is a u32. So we need to copy args to a temp
1380  * variable to read it. Most notably this avoids extended load instructions
1381  * on unaligned addresses
1382  */
1383 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1384 {
1385 	unsigned long val;
1386 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1387 
1388 	memcpy(&val, p, sizeof(val));
1389 	return val;
1390 }
1391 
1392 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1393 				      struct syscall_arg *arg)
1394 {
1395 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1396 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1397 
1398 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1399 }
1400 
1401 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1402 				     struct syscall_arg *arg, unsigned long val)
1403 {
1404 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1405 		arg->val = val;
1406 		if (sc->arg_fmt[arg->idx].parm)
1407 			arg->parm = sc->arg_fmt[arg->idx].parm;
1408 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1409 	}
1410 	return scnprintf(bf, size, "%ld", val);
1411 }
1412 
1413 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1414 				      unsigned char *args, struct trace *trace,
1415 				      struct thread *thread)
1416 {
1417 	size_t printed = 0;
1418 	unsigned long val;
1419 	u8 bit = 1;
1420 	struct syscall_arg arg = {
1421 		.args	= args,
1422 		.idx	= 0,
1423 		.mask	= 0,
1424 		.trace  = trace,
1425 		.thread = thread,
1426 	};
1427 	struct thread_trace *ttrace = thread__priv(thread);
1428 
1429 	/*
1430 	 * Things like fcntl will set this in its 'cmd' formatter to pick the
1431 	 * right formatter for the return value (an fd? file flags?), which is
1432 	 * not needed for syscalls that always return a given type, say an fd.
1433 	 */
1434 	ttrace->ret_scnprintf = NULL;
1435 
1436 	if (sc->args != NULL) {
1437 		struct format_field *field;
1438 
1439 		for (field = sc->args; field;
1440 		     field = field->next, ++arg.idx, bit <<= 1) {
1441 			if (arg.mask & bit)
1442 				continue;
1443 
1444 			val = syscall_arg__val(&arg, arg.idx);
1445 
1446 			/*
1447 			 * Suppress this argument if its value is zero and
1448 			 * we don't have a string associated with it in a
1449 			 * strarray.
1450 			 */
1451 			if (val == 0 &&
1452 			    !(sc->arg_fmt &&
1453 			      (sc->arg_fmt[arg.idx].show_zero ||
1454 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
1455 			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
1456 			      sc->arg_fmt[arg.idx].parm))
1457 				continue;
1458 
1459 			printed += scnprintf(bf + printed, size - printed,
1460 					     "%s%s: ", printed ? ", " : "", field->name);
1461 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1462 		}
1463 	} else if (IS_ERR(sc->tp_format)) {
1464 		/*
1465 		 * If we managed to read the tracepoint /format file, then we
1466 		 * may end up not having any args, like with gettid(), so only
1467 		 * print the raw args when we didn't manage to read it.
1468 		 */
1469 		while (arg.idx < sc->nr_args) {
1470 			if (arg.mask & bit)
1471 				goto next_arg;
1472 			val = syscall_arg__val(&arg, arg.idx);
1473 			if (printed)
1474 				printed += scnprintf(bf + printed, size - printed, ", ");
1475 			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
1476 			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
1477 next_arg:
1478 			++arg.idx;
1479 			bit <<= 1;
1480 		}
1481 	}
1482 
1483 	return printed;
1484 }
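
/*
 * Example, added for exposition: for close(3) on a thread where fd 3
 * resolves to /tmp/foo (a hypothetical path), the loop above produces
 * "fd: 3</tmp/foo>", which trace__sys_enter() wraps as
 * "close(fd: 3</tmp/foo>)".
 */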
1485 
1486 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1487 				  union perf_event *event,
1488 				  struct perf_sample *sample);
1489 
1490 static struct syscall *trace__syscall_info(struct trace *trace,
1491 					   struct perf_evsel *evsel, int id)
1492 {
1493 
1494 	if (id < 0) {
1495 
1496 		/*
1497 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1498 		 * before that, leaving at a higher verbosity level till that is
1499 		 * explained. Reproduced with plain ftrace with:
1500 		 *
1501 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1502 		 * grep "NR -1 " /t/trace_pipe
1503 		 *
1504 		 * After generating some load on the machine.
1505 		 */
1506 		if (verbose > 1) {
1507 			static u64 n;
1508 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1509 				id, perf_evsel__name(evsel), ++n);
1510 		}
1511 		return NULL;
1512 	}
1513 
1514 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1515 	    trace__read_syscall_info(trace, id))
1516 		goto out_cant_read;
1517 
1518 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1519 		goto out_cant_read;
1520 
1521 	return &trace->syscalls.table[id];
1522 
1523 out_cant_read:
1524 	if (verbose > 0) {
1525 		fprintf(trace->output, "Problems reading syscall %d", id);
1526 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1527 			fprintf(trace->output, " (%s)", trace->syscalls.table[id].name);
1528 		fputs(" information\n", trace->output);
1529 	}
1530 	return NULL;
1531 }
1532 
1533 static void thread__update_stats(struct thread_trace *ttrace,
1534 				 int id, struct perf_sample *sample)
1535 {
1536 	struct int_node *inode;
1537 	struct stats *stats;
1538 	u64 duration = 0;
1539 
1540 	inode = intlist__findnew(ttrace->syscall_stats, id);
1541 	if (inode == NULL)
1542 		return;
1543 
1544 	stats = inode->priv;
1545 	if (stats == NULL) {
1546 		stats = malloc(sizeof(struct stats));
1547 		if (stats == NULL)
1548 			return;
1549 		init_stats(stats);
1550 		inode->priv = stats;
1551 	}
1552 
1553 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1554 		duration = sample->time - ttrace->entry_time;
1555 
1556 	update_stats(stats, duration);
1557 }
1558 
1559 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1560 {
1561 	struct thread_trace *ttrace;
1562 	u64 duration;
1563 	size_t printed;
1564 
1565 	if (trace->current == NULL)
1566 		return 0;
1567 
1568 	ttrace = thread__priv(trace->current);
1569 
1570 	if (!ttrace->entry_pending)
1571 		return 0;
1572 
1573 	duration = sample->time - ttrace->entry_time;
1574 
1575 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1576 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1577 	ttrace->entry_pending = false;
1578 
1579 	return printed;
1580 }
1581 
1582 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1583 				 struct perf_sample *sample, struct thread *thread)
1584 {
1585 	int printed = 0;
1586 
1587 	if (trace->print_sample) {
1588 		double ts = (double)sample->time / NSEC_PER_MSEC;
1589 
1590 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1591 				   perf_evsel__name(evsel), ts,
1592 				   thread__comm_str(thread),
1593 				   sample->pid, sample->tid, sample->cpu);
1594 	}
1595 
1596 	return printed;
1597 }
1598 
1599 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1600 			    union perf_event *event __maybe_unused,
1601 			    struct perf_sample *sample)
1602 {
1603 	char *msg;
1604 	void *args;
1605 	size_t printed = 0;
1606 	struct thread *thread;
1607 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1608 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1609 	struct thread_trace *ttrace;
1610 
1611 	if (sc == NULL)
1612 		return -1;
1613 
1614 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1615 	ttrace = thread__trace(thread, trace->output);
1616 	if (ttrace == NULL)
1617 		goto out_put;
1618 
1619 	trace__fprintf_sample(trace, evsel, sample, thread);
1620 
1621 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1622 
1623 	if (ttrace->entry_str == NULL) {
1624 		ttrace->entry_str = malloc(trace__entry_str_size);
1625 		if (!ttrace->entry_str)
1626 			goto out_put;
1627 	}
1628 
1629 	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
1630 		trace__printf_interrupted_entry(trace, sample);
1631 
1632 	ttrace->entry_time = sample->time;
1633 	msg = ttrace->entry_str;
1634 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1635 
1636 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1637 					   args, trace, thread);
1638 
1639 	if (sc->is_exit) {
1640 		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
1641 			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
1642 			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
1643 		}
1644 	} else {
1645 		ttrace->entry_pending = true;
1646 		/* See trace__vfs_getname & trace__sys_exit */
1647 		ttrace->filename.pending_open = false;
1648 	}
1649 
1650 	if (trace->current != thread) {
1651 		thread__put(trace->current);
1652 		trace->current = thread__get(thread);
1653 	}
1654 	err = 0;
1655 out_put:
1656 	thread__put(thread);
1657 	return err;
1658 }
1659 
1660 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1661 				    struct perf_sample *sample,
1662 				    struct callchain_cursor *cursor)
1663 {
1664 	struct addr_location al;
1665 
1666 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1667 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, evsel->attr.sample_max_stack))
1668 		return -1;
1669 
1670 	return 0;
1671 }
1672 
1673 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1674 {
1675 	/* TODO: user-configurable print_opts */
1676 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1677 				        EVSEL__PRINT_DSO |
1678 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1679 
1680 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1681 }
1682 
1683 static const char *errno_to_name(struct perf_evsel *evsel, int err)
1684 {
1685 	struct perf_env *env = perf_evsel__env(evsel);
1686 	const char *arch_name = perf_env__arch(env);
1687 
1688 	return arch_syscalls__strerrno(arch_name, err);
1689 }
1690 
1691 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1692 			   union perf_event *event __maybe_unused,
1693 			   struct perf_sample *sample)
1694 {
1695 	long ret;
1696 	u64 duration = 0;
1697 	bool duration_calculated = false;
1698 	struct thread *thread;
1699 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
1700 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1701 	struct thread_trace *ttrace;
1702 
1703 	if (sc == NULL)
1704 		return -1;
1705 
1706 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1707 	ttrace = thread__trace(thread, trace->output);
1708 	if (ttrace == NULL)
1709 		goto out_put;
1710 
1711 	trace__fprintf_sample(trace, evsel, sample, thread);
1712 
1713 	if (trace->summary)
1714 		thread__update_stats(ttrace, id, sample);
1715 
1716 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1717 
1718 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
1719 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
1720 		ttrace->filename.pending_open = false;
1721 		++trace->stats.vfs_getname;
1722 	}
1723 
1724 	if (ttrace->entry_time) {
1725 		duration = sample->time - ttrace->entry_time;
1726 		if (trace__filter_duration(trace, duration))
1727 			goto out;
1728 		duration_calculated = true;
1729 	} else if (trace->duration_filter)
1730 		goto out;
1731 
1732 	if (sample->callchain) {
1733 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1734 		if (callchain_ret == 0) {
1735 			if (callchain_cursor.nr < trace->min_stack)
1736 				goto out;
1737 			callchain_ret = 1;
1738 		}
1739 	}
1740 
1741 	if (trace->summary_only)
1742 		goto out;
1743 
1744 	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);
1745 
1746 	if (ttrace->entry_pending) {
1747 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1748 	} else {
1749 		fprintf(trace->output, " ... [");
1750 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1751 		fprintf(trace->output, "]: %s()", sc->name);
1752 	}
1753 
1754 	if (sc->fmt == NULL) {
1755 		if (ret < 0)
1756 			goto errno_print;
1757 signed_print:
1758 		fprintf(trace->output, ") = %ld", ret);
1759 	} else if (ret < 0) {
1760 errno_print: {
1761 		char bf[STRERR_BUFSIZE];
1762 		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
1763 			   *e = errno_to_name(evsel, -ret);
1764 
1765 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1766 	}
1767 	} else if (ret == 0 && sc->fmt->timeout)
1768 		fprintf(trace->output, ") = 0 Timeout");
1769 	else if (ttrace->ret_scnprintf) {
1770 		char bf[1024];
1771 		struct syscall_arg arg = {
1772 			.val	= ret,
1773 			.thread	= thread,
1774 			.trace	= trace,
1775 		};
1776 		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
1777 		ttrace->ret_scnprintf = NULL;
1778 		fprintf(trace->output, ") = %s", bf);
1779 	} else if (sc->fmt->hexret)
1780 		fprintf(trace->output, ") = %#lx", ret);
1781 	else if (sc->fmt->errpid) {
1782 		struct thread *child = machine__find_thread(trace->host, ret, ret);
1783 
1784 		if (child != NULL) {
1785 			fprintf(trace->output, ") = %ld", ret);
1786 			if (child->comm_set)
1787 				fprintf(trace->output, " (%s)", thread__comm_str(child));
1788 			thread__put(child);
1789 		}
1790 	} else
1791 		goto signed_print;
1792 
1793 	fputc('\n', trace->output);
1794 
1795 	if (callchain_ret > 0)
1796 		trace__fprintf_callchain(trace, sample);
1797 	else if (callchain_ret < 0)
1798 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1799 out:
1800 	ttrace->entry_pending = false;
1801 	err = 0;
1802 out_put:
1803 	thread__put(thread);
1804 	return err;
1805 }
1806 
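/*
 * Handler for the probe:vfs_getname probe point: stashes the pathname being
 * resolved so that trace__sys_exit can associate it with the returned fd
 * and, if a syscall entry is still pending, splices the name into its
 * formatted argument string at the position recorded in
 * ttrace->filename.entry_str_pos.
 */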
1807 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1808 			      union perf_event *event __maybe_unused,
1809 			      struct perf_sample *sample)
1810 {
1811 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1812 	struct thread_trace *ttrace;
1813 	size_t filename_len, entry_str_len, to_move;
1814 	ssize_t remaining_space;
1815 	char *pos;
1816 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
1817 
1818 	if (!thread)
1819 		goto out;
1820 
1821 	ttrace = thread__priv(thread);
1822 	if (!ttrace)
1823 		goto out_put;
1824 
1825 	filename_len = strlen(filename);
1826 	if (filename_len == 0)
1827 		goto out_put;
1828 
1829 	if (ttrace->filename.namelen < filename_len) {
1830 		char *f = realloc(ttrace->filename.name, filename_len + 1);
1831 
1832 		if (f == NULL)
1833 			goto out_put;
1834 
1835 		ttrace->filename.namelen = filename_len;
1836 		ttrace->filename.name = f;
1837 	}
1838 
1839 	strcpy(ttrace->filename.name, filename);
1840 	ttrace->filename.pending_open = true;
1841 
1842 	if (!ttrace->filename.ptr)
1843 		goto out_put;
1844 
1845 	entry_str_len = strlen(ttrace->entry_str);
1846 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
1847 	if (remaining_space <= 0)
1848 		goto out_put;
1849 
1850 	if (filename_len > (size_t)remaining_space) {
1851 		filename += filename_len - remaining_space;
1852 		filename_len = remaining_space;
1853 	}
1854 
1855 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
1856 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
1857 	memmove(pos + filename_len, pos, to_move);
1858 	memcpy(pos, filename, filename_len);
1859 
1860 	ttrace->filename.ptr = 0;
1861 	ttrace->filename.entry_str_pos = 0;
1862 out_put:
1863 	thread__put(thread);
1864 out:
1865 	return 0;
1866 }
1867 
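/*
 * Handler for sched:sched_stat_runtime: accumulates the reported runtime,
 * in milliseconds, both per thread and for the whole session (--sched).
 */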
1868 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1869 				     union perf_event *event __maybe_unused,
1870 				     struct perf_sample *sample)
1871 {
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1873 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1874 	struct thread *thread = machine__findnew_thread(trace->host,
1875 							sample->pid,
1876 							sample->tid);
1877 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1878 
1879 	if (ttrace == NULL)
1880 		goto out_dump;
1881 
1882 	ttrace->runtime_ms += runtime_ms;
1883 	trace->runtime_ms += runtime_ms;
1884 out_put:
1885 	thread__put(thread);
1886 	return 0;
1887 
1888 out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
1890 	       evsel->name,
1891 	       perf_evsel__strval(evsel, sample, "comm"),
1892 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1893 	       runtime,
1894 	       perf_evsel__intval(evsel, sample, "vruntime"));
1895 	goto out_put;
1896 }
1897 
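/*
 * binary__fprintf() callback: prints printable bytes as-is and everything
 * else as '.', ignoring the address/hex columns.
 */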
1898 static int bpf_output__printer(enum binary_printer_ops op,
1899 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1900 {
1901 	unsigned char ch = (unsigned char)val;
1902 
1903 	switch (op) {
1904 	case BINARY_PRINT_CHAR_DATA:
1905 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1906 	case BINARY_PRINT_DATA_BEGIN:
1907 	case BINARY_PRINT_LINE_BEGIN:
1908 	case BINARY_PRINT_ADDR:
1909 	case BINARY_PRINT_NUM_DATA:
1910 	case BINARY_PRINT_NUM_PAD:
1911 	case BINARY_PRINT_SEP:
1912 	case BINARY_PRINT_CHAR_PAD:
1913 	case BINARY_PRINT_LINE_END:
1914 	case BINARY_PRINT_DATA_END:
1915 	default:
1916 		break;
1917 	}
1918 
1919 	return 0;
1920 }
1921 
1922 static void bpf_output__fprintf(struct trace *trace,
1923 				struct perf_sample *sample)
1924 {
1925 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1926 			bpf_output__printer, NULL, trace->output);
1927 }
1928 
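/*
 * Handler for the non-syscall events selected with --event: prints a
 * timestamped line with the event name and either its BPF output buffer
 * or its tracepoint fields, plus an optional callchain.
 */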
1929 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
1930 				union perf_event *event __maybe_unused,
1931 				struct perf_sample *sample)
1932 {
1933 	int callchain_ret = 0;
1934 
1935 	if (sample->callchain) {
1936 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
1937 		if (callchain_ret == 0) {
1938 			if (callchain_cursor.nr < trace->min_stack)
1939 				goto out;
1940 			callchain_ret = 1;
1941 		}
1942 	}
1943 
1944 	trace__printf_interrupted_entry(trace, sample);
1945 	trace__fprintf_tstamp(trace, sample->time, trace->output);
1946 
1947 	if (trace->trace_syscalls)
1948 		fprintf(trace->output, "(         ): ");
1949 
1950 	fprintf(trace->output, "%s:", evsel->name);
1951 
1952 	if (perf_evsel__is_bpf_output(evsel)) {
1953 		bpf_output__fprintf(trace, sample);
1954 	} else if (evsel->tp_format) {
1955 		event_format__fprintf(evsel->tp_format, sample->cpu,
1956 				      sample->raw_data, sample->raw_size,
1957 				      trace->output);
1958 	}
1959 
1960 	fprintf(trace->output, ")\n");
1961 
1962 	if (callchain_ret > 0)
1963 		trace__fprintf_callchain(trace, sample);
1964 	else if (callchain_ret < 0)
1965 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
1966 out:
1967 	return 0;
1968 }
1969 
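/*
 * Print where a fault took place: DSO and/or symbol+offset when resolved,
 * falling back to raw addresses.
 */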
1970 static void print_location(FILE *f, struct perf_sample *sample,
1971 			   struct addr_location *al,
1972 			   bool print_dso, bool print_sym)
1973 {
1974 
1975 	if ((verbose > 0 || print_dso) && al->map)
1976 		fprintf(f, "%s@", al->map->dso->long_name);
1977 
1978 	if ((verbose > 0 || print_sym) && al->sym)
1979 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1980 			al->addr - al->sym->start);
1981 	else if (al->map)
1982 		fprintf(f, "0x%" PRIx64, al->addr);
1983 	else
1984 		fprintf(f, "0x%" PRIx64, sample->addr);
1985 }
1986 
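/*
 * Handler for the page fault software events: counts major/minor faults
 * per thread and, unless --summary is in place, prints the faulting IP and
 * target address together with the map type ('d'ata, e'x'ecutable, '?'
 * unresolved) and the level at which they were resolved.
 */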
1987 static int trace__pgfault(struct trace *trace,
1988 			  struct perf_evsel *evsel,
1989 			  union perf_event *event __maybe_unused,
1990 			  struct perf_sample *sample)
1991 {
1992 	struct thread *thread;
1993 	struct addr_location al;
1994 	char map_type = 'd';
1995 	struct thread_trace *ttrace;
1996 	int err = -1;
1997 	int callchain_ret = 0;
1998 
1999 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2000 
2001 	if (sample->callchain) {
2002 		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
2003 		if (callchain_ret == 0) {
2004 			if (callchain_cursor.nr < trace->min_stack)
2005 				goto out_put;
2006 			callchain_ret = 1;
2007 		}
2008 	}
2009 
2010 	ttrace = thread__trace(thread, trace->output);
2011 	if (ttrace == NULL)
2012 		goto out_put;
2013 
2014 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2015 		ttrace->pfmaj++;
2016 	else
2017 		ttrace->pfmin++;
2018 
2019 	if (trace->summary_only)
2020 		goto out;
2021 
2022 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2023 			      sample->ip, &al);
2024 
2025 	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);
2026 
2027 	fprintf(trace->output, "%sfault [",
2028 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2029 		"maj" : "min");
2030 
2031 	print_location(trace->output, sample, &al, false, true);
2032 
2033 	fprintf(trace->output, "] => ");
2034 
2035 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2036 				   sample->addr, &al);
2037 
2038 	if (!al.map) {
2039 		thread__find_addr_location(thread, sample->cpumode,
2040 					   MAP__FUNCTION, sample->addr, &al);
2041 
2042 		if (al.map)
2043 			map_type = 'x';
2044 		else
2045 			map_type = '?';
2046 	}
2047 
2048 	print_location(trace->output, sample, &al, true, false);
2049 
2050 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2051 
2052 	if (callchain_ret > 0)
2053 		trace__fprintf_callchain(trace, sample);
2054 	else if (callchain_ret < 0)
2055 		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
2056 out:
2057 	err = 0;
2058 out_put:
2059 	thread__put(thread);
2060 	return err;
2061 }
2062 
2063 static void trace__set_base_time(struct trace *trace,
2064 				 struct perf_evsel *evsel,
2065 				 struct perf_sample *sample)
2066 {
2067 	/*
2068 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally: we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for a good
	 * reason, i.e. we may not be interested in its timestamps, just in
2072 	 * it taking place, picking some piece of information when it
2073 	 * appears in our event stream (vfs_getname comes to mind).
2074 	 */
2075 	if (trace->base_time == 0 && !trace->full_time &&
2076 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2077 		trace->base_time = sample->time;
2078 }
2079 
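/*
 * Used in replay mode (trace->tool.sample): dispatch a sample from the
 * perf.data file to the handler installed for its evsel, skipping
 * filtered threads.
 */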
2080 static int trace__process_sample(struct perf_tool *tool,
2081 				 union perf_event *event,
2082 				 struct perf_sample *sample,
2083 				 struct perf_evsel *evsel,
2084 				 struct machine *machine __maybe_unused)
2085 {
2086 	struct trace *trace = container_of(tool, struct trace, tool);
2087 	struct thread *thread;
2088 	int err = 0;
2089 
2090 	tracepoint_handler handler = evsel->handler;
2091 
2092 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2093 	if (thread && thread__is_filtered(thread))
2094 		goto out;
2095 
2096 	trace__set_base_time(trace, evsel, sample);
2097 
2098 	if (handler) {
2099 		++trace->nr_events;
2100 		handler(trace, evsel, event, sample);
2101 	}
2102 out:
2103 	thread__put(thread);
2104 	return err;
2105 }
2106 
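/*
 * 'perf trace record' is implemented by building a 'perf record' command
 * line and handing it to cmd_record(), e.g. (assuming a current kernel
 * with the raw_syscalls tracepoints):
 *
 *   perf record -R -m 1024 -c 1 \
 *	-e raw_syscalls:sys_enter,raw_syscalls:sys_exit <workload>
 */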
2107 static int trace__record(struct trace *trace, int argc, const char **argv)
2108 {
2109 	unsigned int rec_argc, i, j;
2110 	const char **rec_argv;
2111 	const char * const record_args[] = {
2112 		"record",
2113 		"-R",
2114 		"-m", "1024",
2115 		"-c", "1",
2116 	};
2117 
2118 	const char * const sc_args[] = { "-e", };
2119 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2120 	const char * const majpf_args[] = { "-e", "major-faults" };
2121 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2122 	const char * const minpf_args[] = { "-e", "minor-faults" };
2123 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2124 
2125 	/* +1 is for the event string below */
2126 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2127 		majpf_args_nr + minpf_args_nr + argc;
2128 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2129 
2130 	if (rec_argv == NULL)
2131 		return -ENOMEM;
2132 
2133 	j = 0;
2134 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2135 		rec_argv[j++] = record_args[i];
2136 
2137 	if (trace->trace_syscalls) {
2138 		for (i = 0; i < sc_args_nr; i++)
2139 			rec_argv[j++] = sc_args[i];
2140 
2141 		/* event string may be different for older kernels - e.g., RHEL6 */
2142 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2143 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2144 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2145 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2146 		else {
2147 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2148 			free(rec_argv);
2149 			return -1;
2150 		}
2151 	}
2152 
2153 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2154 		for (i = 0; i < majpf_args_nr; i++)
2155 			rec_argv[j++] = majpf_args[i];
2156 
2157 	if (trace->trace_pgfaults & TRACE_PFMIN)
2158 		for (i = 0; i < minpf_args_nr; i++)
2159 			rec_argv[j++] = minpf_args[i];
2160 
2161 	for (i = 0; i < (unsigned int)argc; i++)
2162 		rec_argv[j++] = argv[i];
2163 
2164 	return cmd_record(j, rec_argv);
2165 }
2166 
2167 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2168 
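/*
 * Add the probe:vfs_getname probe point, if it was previously set up with
 * 'perf probe' and has the expected "pathname" field; the return value
 * says whether the filename beautifier can be used.
 */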
2169 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2170 {
2171 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2172 
2173 	if (IS_ERR(evsel))
2174 		return false;
2175 
2176 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2177 		perf_evsel__delete(evsel);
2178 		return false;
2179 	}
2180 
2181 	evsel->handler = trace__vfs_getname;
2182 	perf_evlist__add(evlist, evsel);
2183 	return true;
2184 }
2185 
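/*
 * Create a software page fault event (major or minor, depending on
 * config), sampling every fault and collecting the data address.
 */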
2186 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2187 {
2188 	struct perf_evsel *evsel;
2189 	struct perf_event_attr attr = {
2190 		.type = PERF_TYPE_SOFTWARE,
2191 		.mmap_data = 1,
2192 	};
2193 
2194 	attr.config = config;
2195 	attr.sample_period = 1;
2196 
2197 	event_attr_init(&attr);
2198 
2199 	evsel = perf_evsel__new(&attr);
2200 	if (evsel)
2201 		evsel->handler = trace__pgfault;
2202 
2203 	return evsel;
2204 }
2205 
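/*
 * Dispatch one mmap'ed event in live mode: non-samples go to the generic
 * machine handling, samples to the handler installed for the evsel that
 * originated them.
 */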
2206 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2207 {
2208 	const u32 type = event->header.type;
2209 	struct perf_evsel *evsel;
2210 
2211 	if (type != PERF_RECORD_SAMPLE) {
2212 		trace__process_event(trace, trace->host, event, sample);
2213 		return;
2214 	}
2215 
2216 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2217 	if (evsel == NULL) {
2218 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2219 		return;
2220 	}
2221 
2222 	trace__set_base_time(trace, evsel, sample);
2223 
2224 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2225 	    sample->raw_data == NULL) {
2226 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2227 		       perf_evsel__name(evsel), sample->tid,
2228 		       sample->cpu, sample->raw_size);
2229 	} else {
2230 		tracepoint_handler handler = evsel->handler;
2231 		handler(trace, evsel, event, sample);
2232 	}
2233 }
2234 
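/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoints, caching the
 * offsets of the fields used in the fast path ("args" and "ret") and
 * configuring callchain collection for both.
 */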
2235 static int trace__add_syscall_newtp(struct trace *trace)
2236 {
2237 	int ret = -1;
2238 	struct perf_evlist *evlist = trace->evlist;
2239 	struct perf_evsel *sys_enter, *sys_exit;
2240 
2241 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2242 	if (sys_enter == NULL)
2243 		goto out;
2244 
2245 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2246 		goto out_delete_sys_enter;
2247 
2248 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2249 	if (sys_exit == NULL)
2250 		goto out_delete_sys_enter;
2251 
2252 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2253 		goto out_delete_sys_exit;
2254 
2255 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2256 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2257 
2258 	perf_evlist__add(evlist, sys_enter);
2259 	perf_evlist__add(evlist, sys_exit);
2260 
2261 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2262 		/*
2263 		 * We're interested only in the user space callchain
2264 		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel-syscall-graph.
2266 		 */
2267 		sys_exit->attr.exclude_callchain_kernel = 1;
2268 	}
2269 
2270 	trace->syscalls.events.sys_enter = sys_enter;
2271 	trace->syscalls.events.sys_exit  = sys_exit;
2272 
2273 	ret = 0;
2274 out:
2275 	return ret;
2276 
2277 out_delete_sys_exit:
2278 	perf_evsel__delete_priv(sys_exit);
2279 out_delete_sys_enter:
2280 	perf_evsel__delete_priv(sys_enter);
2281 	goto out;
2282 }
2283 
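/*
 * Translate the -e syscall list into a tracepoint filter on the "id"
 * field and append it to both the sys_enter and sys_exit events, e.g.
 * '-e open,close' becomes something like "id == 2 || id == 3" (the actual
 * numbers come from the per-architecture syscall table).
 */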
2284 static int trace__set_ev_qualifier_filter(struct trace *trace)
2285 {
2286 	int err = -1;
2287 	struct perf_evsel *sys_exit;
2288 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2289 						trace->ev_qualifier_ids.nr,
2290 						trace->ev_qualifier_ids.entries);
2291 
2292 	if (filter == NULL)
2293 		goto out_enomem;
2294 
2295 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2296 					  filter)) {
2297 		sys_exit = trace->syscalls.events.sys_exit;
2298 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2299 	}
2300 
2301 	free(filter);
2302 out:
2303 	return err;
2304 out_enomem:
2305 	errno = ENOMEM;
2306 	goto out;
2307 }
2308 
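/*
 * Avoid a feedback loop when tracing system wide: filter out our own pid
 * and, when running over ssh, the sshd ancestor that ends up writing our
 * output to the terminal, which would otherwise generate ever more events.
 */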
2309 static int trace__set_filter_loop_pids(struct trace *trace)
2310 {
2311 	unsigned int nr = 1;
2312 	pid_t pids[32] = {
2313 		getpid(),
2314 	};
2315 	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2316 
2317 	while (thread && nr < ARRAY_SIZE(pids)) {
2318 		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2319 
2320 		if (parent == NULL)
2321 			break;
2322 
2323 		if (!strcmp(thread__comm_str(parent), "sshd")) {
2324 			pids[nr++] = parent->tid;
2325 			break;
2326 		}
2327 		thread = parent;
2328 	}
2329 
2330 	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2331 }
2332 
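/*
 * Live mode: set up the events, start the workload and/or attach to the
 * target, then consume the ring buffers until interrupted or the workload
 * finishes, printing each event as it comes.
 */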
2333 static int trace__run(struct trace *trace, int argc, const char **argv)
2334 {
2335 	struct perf_evlist *evlist = trace->evlist;
2336 	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
2337 	int err = -1, i;
2338 	unsigned long before;
2339 	const bool forks = argc > 0;
2340 	bool draining = false;
2341 
2342 	trace->live = true;
2343 
2344 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2345 		goto out_error_raw_syscalls;
2346 
2347 	if (trace->trace_syscalls)
2348 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2349 
2350 	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
2351 		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
2352 		if (pgfault_maj == NULL)
2353 			goto out_error_mem;
2354 		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
2355 		perf_evlist__add(evlist, pgfault_maj);
2356 	}
2357 
2358 	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
2359 		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
2360 		if (pgfault_min == NULL)
2361 			goto out_error_mem;
2362 		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
2363 		perf_evlist__add(evlist, pgfault_min);
2364 	}
2365 
2366 	if (trace->sched &&
2367 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2368 				   trace__sched_stat_runtime))
2369 		goto out_error_sched_stat_runtime;
2370 
2371 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2372 	if (err < 0) {
2373 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2374 		goto out_delete_evlist;
2375 	}
2376 
2377 	err = trace__symbols_init(trace, evlist);
2378 	if (err < 0) {
2379 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2380 		goto out_delete_evlist;
2381 	}
2382 
2383 	perf_evlist__config(evlist, &trace->opts, &callchain_param);
2384 
2385 	signal(SIGCHLD, sig_handler);
2386 	signal(SIGINT, sig_handler);
2387 
2388 	if (forks) {
2389 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2390 						    argv, false, NULL);
2391 		if (err < 0) {
2392 			fprintf(trace->output, "Couldn't run the workload!\n");
2393 			goto out_delete_evlist;
2394 		}
2395 	}
2396 
2397 	err = perf_evlist__open(evlist);
2398 	if (err < 0)
2399 		goto out_error_open;
2400 
2401 	err = bpf__apply_obj_config();
2402 	if (err) {
2403 		char errbuf[BUFSIZ];
2404 
2405 		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
2406 		pr_err("ERROR: Apply config to BPF failed: %s\n",
2407 			 errbuf);
2408 		goto out_error_open;
2409 	}
2410 
2411 	/*
2412 	 * Better not use !target__has_task() here because we need to cover the
2413 	 * case where no threads were specified in the command line, but a
2414 	 * workload was, and in that case we will fill in the thread_map when
2415 	 * we fork the workload in perf_evlist__prepare_workload.
2416 	 */
2417 	if (trace->filter_pids.nr > 0)
2418 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2419 	else if (thread_map__pid(evlist->threads, 0) == -1)
2420 		err = trace__set_filter_loop_pids(trace);
2421 
2422 	if (err < 0)
2423 		goto out_error_mem;
2424 
2425 	if (trace->ev_qualifier_ids.nr > 0) {
2426 		err = trace__set_ev_qualifier_filter(trace);
2427 		if (err < 0)
2428 			goto out_errno;
2429 
2430 		pr_debug("event qualifier tracepoint filter: %s\n",
2431 			 trace->syscalls.events.sys_exit->filter);
2432 	}
2433 
2434 	err = perf_evlist__apply_filters(evlist, &evsel);
2435 	if (err < 0)
2436 		goto out_error_apply_filters;
2437 
2438 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
2439 	if (err < 0)
2440 		goto out_error_mmap;
2441 
2442 	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
2443 		perf_evlist__enable(evlist);
2444 
2445 	if (forks)
2446 		perf_evlist__start_workload(evlist);
2447 
2448 	if (trace->opts.initial_delay) {
2449 		usleep(trace->opts.initial_delay * 1000);
2450 		perf_evlist__enable(evlist);
2451 	}
2452 
2453 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2454 				  evlist->threads->nr > 1 ||
2455 				  perf_evlist__first(evlist)->attr.inherit;
2456 
2457 	/*
	 * Now that we already used evsel->attr to ask the kernel to set up the
	 * events, let's reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
2462 	 */
2463 	evlist__for_each_entry(evlist, evsel) {
2464 		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
2465 		    evsel->attr.sample_max_stack == 0)
2466 			evsel->attr.sample_max_stack = trace->max_stack;
2467 	}
2468 again:
2469 	before = trace->nr_events;
2470 
2471 	for (i = 0; i < evlist->nr_mmaps; i++) {
2472 		union perf_event *event;
2473 
2474 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2475 			struct perf_sample sample;
2476 
2477 			++trace->nr_events;
2478 
2479 			err = perf_evlist__parse_sample(evlist, event, &sample);
2480 			if (err) {
2481 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2482 				goto next_event;
2483 			}
2484 
2485 			trace__handle_event(trace, event, &sample);
2486 next_event:
2487 			perf_evlist__mmap_consume(evlist, i);
2488 
2489 			if (interrupted)
2490 				goto out_disable;
2491 
2492 			if (done && !draining) {
2493 				perf_evlist__disable(evlist);
2494 				draining = true;
2495 			}
2496 		}
2497 	}
2498 
2499 	if (trace->nr_events == before) {
2500 		int timeout = done ? 100 : -1;
2501 
2502 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2503 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2504 				draining = true;
2505 
2506 			goto again;
2507 		}
2508 	} else {
2509 		goto again;
2510 	}
2511 
2512 out_disable:
2513 	thread__zput(trace->current);
2514 
2515 	perf_evlist__disable(evlist);
2516 
2517 	if (!err) {
2518 		if (trace->summary)
2519 			trace__fprintf_thread_summary(trace, trace->output);
2520 
2521 		if (trace->show_tool_stats) {
2522 			fprintf(trace->output, "Stats:\n "
2523 					       " vfs_getname : %" PRIu64 "\n"
2524 					       " proc_getname: %" PRIu64 "\n",
2525 				trace->stats.vfs_getname,
2526 				trace->stats.proc_getname);
2527 		}
2528 	}
2529 
2530 out_delete_evlist:
2531 	trace__symbols__exit(trace);
2532 
2533 	perf_evlist__delete(evlist);
2534 	trace->evlist = NULL;
2535 	trace->live = false;
2536 	return err;
2537 {
2538 	char errbuf[BUFSIZ];
2539 
2540 out_error_sched_stat_runtime:
2541 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2542 	goto out_error;
2543 
2544 out_error_raw_syscalls:
2545 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2546 	goto out_error;
2547 
2548 out_error_mmap:
2549 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2550 	goto out_error;
2551 
2552 out_error_open:
2553 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2554 
2555 out_error:
2556 	fprintf(trace->output, "%s\n", errbuf);
2557 	goto out_delete_evlist;
2558 
2559 out_error_apply_filters:
2560 	fprintf(trace->output,
2561 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2562 		evsel->filter, perf_evsel__name(evsel), errno,
2563 		str_error_r(errno, errbuf, sizeof(errbuf)));
2564 	goto out_delete_evlist;
2565 }
2566 out_error_mem:
2567 	fprintf(trace->output, "Not enough memory to run!\n");
2568 	goto out_delete_evlist;
2569 
2570 out_errno:
2571 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2572 	goto out_delete_evlist;
2573 }
2574 
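/*
 * Replay mode ('perf trace -i perf.data'): process a previously recorded
 * file with the same handlers used in live mode, resolving symbols from
 * the session environment.
 */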
2575 static int trace__replay(struct trace *trace)
2576 {
2577 	const struct perf_evsel_str_handler handlers[] = {
2578 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2579 	};
2580 	struct perf_data data = {
2581 		.file      = {
2582 			.path = input_name,
2583 		},
2584 		.mode      = PERF_DATA_MODE_READ,
2585 		.force     = trace->force,
2586 	};
2587 	struct perf_session *session;
2588 	struct perf_evsel *evsel;
2589 	int err = -1;
2590 
2591 	trace->tool.sample	  = trace__process_sample;
2592 	trace->tool.mmap	  = perf_event__process_mmap;
2593 	trace->tool.mmap2	  = perf_event__process_mmap2;
2594 	trace->tool.comm	  = perf_event__process_comm;
2595 	trace->tool.exit	  = perf_event__process_exit;
2596 	trace->tool.fork	  = perf_event__process_fork;
2597 	trace->tool.attr	  = perf_event__process_attr;
2598 	trace->tool.tracing_data  = perf_event__process_tracing_data;
2599 	trace->tool.build_id	  = perf_event__process_build_id;
2600 	trace->tool.namespaces	  = perf_event__process_namespaces;
2601 
2602 	trace->tool.ordered_events = true;
2603 	trace->tool.ordering_requires_timestamps = true;
2604 
2605 	/* add tid to output */
2606 	trace->multiple_threads = true;
2607 
2608 	session = perf_session__new(&data, false, &trace->tool);
2609 	if (session == NULL)
2610 		return -1;
2611 
2612 	if (trace->opts.target.pid)
2613 		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);
2614 
2615 	if (trace->opts.target.tid)
2616 		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);
2617 
2618 	if (symbol__init(&session->header.env) < 0)
2619 		goto out;
2620 
2621 	trace->host = &session->machines.host;
2622 
2623 	err = perf_session__set_tracepoints_handlers(session, handlers);
2624 	if (err)
2625 		goto out;
2626 
2627 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2628 						     "raw_syscalls:sys_enter");
	/* older kernels have the syscalls:* tracepoints instead of raw_syscalls:* */
2630 	if (evsel == NULL)
2631 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2632 							     "syscalls:sys_enter");
2633 
2634 	if (evsel &&
2635 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2636 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2637 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2638 		goto out;
2639 	}
2640 
2641 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2642 						     "raw_syscalls:sys_exit");
2643 	if (evsel == NULL)
2644 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2645 							     "syscalls:sys_exit");
2646 	if (evsel &&
2647 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2648 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2649 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2650 		goto out;
2651 	}
2652 
2653 	evlist__for_each_entry(session->evlist, evsel) {
2654 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2655 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2656 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2657 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2658 			evsel->handler = trace__pgfault;
2659 	}
2660 
2661 	setup_pager();
2662 
2663 	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d\n", err);
	else if (trace->summary)
2668 		trace__fprintf_thread_summary(trace, trace->output);
2669 
2670 out:
2671 	perf_session__delete(session);
2672 
2673 	return err;
2674 }
2675 
2676 static size_t trace__fprintf_threads_header(FILE *fp)
2677 {
2678 	size_t printed;
2679 
2680 	printed  = fprintf(fp, "\n Summary of events:\n\n");
2681 
2682 	return printed;
2683 }
2684 
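/*
 * Resort the per-thread syscall stats rbtree so that the summary comes out
 * ordered by the total time spent in each syscall.
 */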
2685 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
2686 	struct stats 	*stats;
2687 	double		msecs;
2688 	int		syscall;
2689 )
2690 {
2691 	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
2692 	struct stats *stats = source->priv;
2693 
2694 	entry->syscall = source->i;
2695 	entry->stats   = stats;
2696 	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
2697 }
2698 
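/*
 * Print the per-syscall statistics table for one thread: number of calls,
 * total/min/avg/max times in milliseconds and the relative stddev.
 */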
2699 static size_t thread__dump_stats(struct thread_trace *ttrace,
2700 				 struct trace *trace, FILE *fp)
2701 {
2702 	size_t printed = 0;
2703 	struct syscall *sc;
2704 	struct rb_node *nd;
2705 	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);
2706 
2707 	if (syscall_stats == NULL)
2708 		return 0;
2709 
2710 	printed += fprintf(fp, "\n");
2711 
2712 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2713 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2714 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2715 
2716 	resort_rb__for_each_entry(nd, syscall_stats) {
2717 		struct stats *stats = syscall_stats_entry->stats;
2718 		if (stats) {
2719 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2720 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2721 			double avg = avg_stats(stats);
2722 			double pct;
2723 			u64 n = (u64) stats->n;
2724 
2725 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2726 			avg /= NSEC_PER_MSEC;
2727 
2728 			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
2729 			printed += fprintf(fp, "   %-15s", sc->name);
2730 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2731 					   n, syscall_stats_entry->msecs, min, avg);
2732 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2733 		}
2734 	}
2735 
2736 	resort_rb__delete(syscall_stats);
2737 	printed += fprintf(fp, "\n\n");
2738 
2739 	return printed;
2740 }
2741 
2742 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2743 {
2744 	size_t printed = 0;
2745 	struct thread_trace *ttrace = thread__priv(thread);
2746 	double ratio;
2747 
2748 	if (ttrace == NULL)
2749 		return 0;
2750 
2751 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2752 
2753 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2754 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2755 	printed += fprintf(fp, "%.1f%%", ratio);
2756 	if (ttrace->pfmaj)
2757 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2758 	if (ttrace->pfmin)
2759 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2760 	if (trace->sched)
2761 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2762 	else if (fputc('\n', fp) != EOF)
2763 		++printed;
2764 
2765 	printed += thread__dump_stats(ttrace, trace, fp);
2766 
2767 	return printed;
2768 }
2769 
2770 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2771 {
2772 	return ttrace ? ttrace->nr_events : 0;
2773 }
2774 
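/* Resort the machine's threads rbtree by how many events each thread generated. */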
2775 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
2776 	struct thread *thread;
2777 )
2778 {
2779 	entry->thread = rb_entry(nd, struct thread, rb_node);
2780 }
2781 
2782 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2783 {
2784 	size_t printed = trace__fprintf_threads_header(fp);
2785 	struct rb_node *nd;
2786 	int i;
2787 
2788 	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
2789 		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);
2790 
2791 		if (threads == NULL) {
2792 			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
2793 			return 0;
2794 		}
2795 
2796 		resort_rb__for_each_entry(nd, threads)
2797 			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);
2798 
2799 		resort_rb__delete(threads);
2800 	}
2801 	return printed;
2802 }
2803 
2804 static int trace__set_duration(const struct option *opt, const char *str,
2805 			       int unset __maybe_unused)
2806 {
2807 	struct trace *trace = opt->value;
2808 
2809 	trace->duration_filter = atof(str);
2810 	return 0;
2811 }
2812 
2813 static int trace__set_filter_pids(const struct option *opt, const char *str,
2814 				  int unset __maybe_unused)
2815 {
2816 	int ret = -1;
2817 	size_t i;
2818 	struct trace *trace = opt->value;
2819 	/*
	 * FIXME: introduce an intarray class, plainly parse the csv and
	 * create a { int nr, int entries[] } struct...
2822 	 */
2823 	struct intlist *list = intlist__new(str);
2824 
2825 	if (list == NULL)
2826 		return -1;
2827 
2828 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2829 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2830 
2831 	if (trace->filter_pids.entries == NULL)
2832 		goto out;
2833 
2834 	trace->filter_pids.entries[0] = getpid();
2835 
2836 	for (i = 1; i < trace->filter_pids.nr; ++i)
2837 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2838 
2839 	intlist__delete(list);
2840 	ret = 0;
2841 out:
2842 	return ret;
2843 }
2844 
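/*
 * Rotate a pre-existing, non-empty output file to "<name>.old" before
 * truncating it for this session.
 */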
2845 static int trace__open_output(struct trace *trace, const char *filename)
2846 {
2847 	struct stat st;
2848 
2849 	if (!stat(filename, &st) && st.st_size) {
2850 		char oldname[PATH_MAX];
2851 
2852 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2853 		unlink(oldname);
2854 		rename(filename, oldname);
2855 	}
2856 
2857 	trace->output = fopen(filename, "w");
2858 
2859 	return trace->output == NULL ? -errno : 0;
2860 }
2861 
2862 static int parse_pagefaults(const struct option *opt, const char *str,
2863 			    int unset __maybe_unused)
2864 {
2865 	int *trace_pgfaults = opt->value;
2866 
2867 	if (strcmp(str, "all") == 0)
2868 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2869 	else if (strcmp(str, "maj") == 0)
2870 		*trace_pgfaults |= TRACE_PFMAJ;
2871 	else if (strcmp(str, "min") == 0)
2872 		*trace_pgfaults |= TRACE_PFMIN;
2873 	else
2874 		return -1;
2875 
2876 	return 0;
2877 }
2878 
2879 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2880 {
2881 	struct perf_evsel *evsel;
2882 
2883 	evlist__for_each_entry(evlist, evsel)
2884 		evsel->handler = handler;
2885 }
2886 
2887 /*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}}) + events (tracepoints, HW, SW, etc.)) to
 * use the existing facilities unchanged (trace->ev_qualifier + parse_options()).
2891  *
2892  * It'd be better to introduce a parse_options() variant that would return a
2893  * list with the terms it didn't match to an event...
2894  */
2895 static int trace__parse_events_option(const struct option *opt, const char *str,
2896 				      int unset __maybe_unused)
2897 {
2898 	struct trace *trace = (struct trace *)opt->value;
2899 	const char *s = str;
2900 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2901 	int len = strlen(str) + 1, err = -1, list, idx;
2902 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2903 	char group_name[PATH_MAX];
2904 
2905 	if (strace_groups_dir == NULL)
2906 		return -1;
2907 
2908 	if (*s == '!') {
2909 		++s;
2910 		trace->not_ev_qualifier = true;
2911 	}
2912 
2913 	while (1) {
2914 		if ((sep = strchr(s, ',')) != NULL)
2915 			*sep = '\0';
2916 
2917 		list = 0;
2918 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2919 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2920 			list = 1;
2921 		} else {
2922 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2923 			if (access(group_name, R_OK) == 0)
2924 				list = 1;
2925 		}
2926 
2927 		if (lists[list]) {
2928 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2929 		} else {
2930 			lists[list] = malloc(len);
2931 			if (lists[list] == NULL)
2932 				goto out;
2933 			strcpy(lists[list], s);
2934 		}
2935 
2936 		if (!sep)
2937 			break;
2938 
2939 		*sep = ',';
2940 		s = sep + 1;
2941 	}
2942 
2943 	if (lists[1] != NULL) {
2944 		struct strlist_config slist_config = {
2945 			.dirname = strace_groups_dir,
2946 		};
2947 
2948 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2949 		if (trace->ev_qualifier == NULL) {
2950 			fputs("Not enough memory to parse event qualifier", trace->output);
2951 			goto out;
2952 		}
2953 
2954 		if (trace__validate_ev_qualifier(trace))
2955 			goto out;
2956 	}
2957 
2958 	err = 0;
2959 
2960 	if (lists[0]) {
2961 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2962 					       "event selector. use 'perf list' to list available events",
2963 					       parse_events_option);
2964 		err = parse_events_option(&o, lists[0], 0);
2965 	}
2966 out:
2967 	if (sep)
2968 		*sep = ',';
2969 
2970 	return err;
2971 }
2972 
2973 int cmd_trace(int argc, const char **argv)
2974 {
2975 	const char *trace_usage[] = {
2976 		"perf trace [<options>] [<command>]",
2977 		"perf trace [<options>] -- <command> [<options>]",
2978 		"perf trace record [<options>] [<command>]",
2979 		"perf trace record [<options>] -- <command> [<options>]",
2980 		NULL
2981 	};
2982 	struct trace trace = {
2983 		.syscalls = {
			.max = -1,
2985 		},
2986 		.opts = {
2987 			.target = {
2988 				.uid	   = UINT_MAX,
2989 				.uses_mmap = true,
2990 			},
2991 			.user_freq     = UINT_MAX,
2992 			.user_interval = ULLONG_MAX,
2993 			.no_buffering  = true,
2994 			.mmap_pages    = UINT_MAX,
2995 			.proc_map_timeout  = 500,
2996 		},
2997 		.output = stderr,
2998 		.show_comm = true,
2999 		.trace_syscalls = true,
3000 		.kernel_syscallchains = false,
3001 		.max_stack = UINT_MAX,
3002 	};
3003 	const char *output_name = NULL;
3004 	const struct option trace_options[] = {
3005 	OPT_CALLBACK('e', "event", &trace, "event",
3006 		     "event/syscall selector. use 'perf list' to list available events",
3007 		     trace__parse_events_option),
3008 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3009 		    "show the thread COMM next to its id"),
3010 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3011 	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
3012 		     trace__parse_events_option),
3013 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3014 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3015 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3016 		    "trace events on existing process id"),
3017 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3018 		    "trace events on existing thread id"),
3019 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3020 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3021 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3022 		    "system-wide collection from all CPUs"),
3023 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3024 		    "list of cpus to monitor"),
3025 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3026 		    "child tasks do not inherit counters"),
3027 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3028 		     "number of mmap data pages",
3029 		     perf_evlist__parse_mmap_pages),
3030 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3031 		   "user to profile"),
3032 	OPT_CALLBACK(0, "duration", &trace, "float",
3033 		     "show only events with duration > N.M ms",
3034 		     trace__set_duration),
3035 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3036 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3037 	OPT_BOOLEAN('T', "time", &trace.full_time,
3038 		    "Show full timestamp, not time relative to first start"),
3039 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3040 		    "Show only syscall summary with statistics"),
3041 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3042 		    "Show all syscalls and summary with statistics"),
3043 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3044 		     "Trace pagefaults", parse_pagefaults, "maj"),
3045 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3046 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3047 	OPT_CALLBACK(0, "call-graph", &trace.opts,
3048 		     "record_mode[,record_size]", record_callchain_help,
3049 		     &record_parse_callchain_opt),
3050 	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
3051 		    "Show the kernel callchains on the syscall exit path"),
3052 	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
3053 		     "Set the minimum stack depth when parsing the callchain, "
3054 		     "anything below the specified depth will be ignored."),
3055 	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
3056 		     "Set the maximum stack depth when parsing the callchain, "
3057 		     "anything beyond the specified depth will be ignored. "
3058 		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
3059 	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
3060 			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
3061 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3062 			"per thread proc mmap processing timeout in ms"),
3063 	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
3064 		     "ms to wait before starting measurement after program "
3065 		     "start"),
3066 	OPT_END()
3067 	};
3068 	bool __maybe_unused max_stack_user_set = true;
3069 	bool mmap_pages_user_set = true;
3070 	const char * const trace_subcommands[] = { "record", NULL };
3071 	int err;
3072 	char bf[BUFSIZ];
3073 
3074 	signal(SIGSEGV, sighandler_dump_stack);
3075 	signal(SIGFPE, sighandler_dump_stack);
3076 
3077 	trace.evlist = perf_evlist__new();
3078 	trace.sctbl = syscalltbl__new();
3079 
3080 	if (trace.evlist == NULL || trace.sctbl == NULL) {
3081 		pr_err("Not enough memory to run!\n");
3082 		err = -ENOMEM;
3083 		goto out;
3084 	}
3085 
3086 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3087 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3088 
3089 	err = bpf__setup_stdout(trace.evlist);
3090 	if (err) {
3091 		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
3092 		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
3093 		goto out;
3094 	}
3095 
3096 	err = -1;
3097 
3098 	if (trace.trace_pgfaults) {
3099 		trace.opts.sample_address = true;
3100 		trace.opts.sample_time = true;
3101 	}
3102 
3103 	if (trace.opts.mmap_pages == UINT_MAX)
3104 		mmap_pages_user_set = false;
3105 
3106 	if (trace.max_stack == UINT_MAX) {
3107 		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
3108 		max_stack_user_set = false;
3109 	}
3110 
3111 #ifdef HAVE_DWARF_UNWIND_SUPPORT
3112 	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
3113 		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
3114 	}
3115 #endif
3116 
3117 	if (callchain_param.enabled) {
3118 		if (!mmap_pages_user_set && geteuid() == 0)
3119 			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;
3120 
3121 		symbol_conf.use_callchain = true;
3122 	}
3123 
3124 	if (trace.evlist->nr_entries > 0)
3125 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3126 
3127 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3128 		return trace__record(&trace, argc-1, &argv[1]);
3129 
3130 	/* summary_only implies summary option, but don't overwrite summary if set */
3131 	if (trace.summary_only)
3132 		trace.summary = trace.summary_only;
3133 
3134 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3135 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3136 		pr_err("Please specify something to trace.\n");
3137 		return -1;
3138 	}
3139 
3140 	if (!trace.trace_syscalls && trace.ev_qualifier) {
3141 		pr_err("The -e option can't be used with --no-syscalls.\n");
3142 		goto out;
3143 	}
3144 
3145 	if (output_name != NULL) {
3146 		err = trace__open_output(&trace, output_name);
3147 		if (err < 0) {
3148 			perror("failed to create output file");
3149 			goto out;
3150 		}
3151 	}
3152 
3153 	trace.open_id = syscalltbl__id(trace.sctbl, "open");
3154 
3155 	err = target__validate(&trace.opts.target);
3156 	if (err) {
3157 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3158 		fprintf(trace.output, "%s", bf);
3159 		goto out_close;
3160 	}
3161 
3162 	err = target__parse_uid(&trace.opts.target);
3163 	if (err) {
3164 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3165 		fprintf(trace.output, "%s", bf);
3166 		goto out_close;
3167 	}
3168 
3169 	if (!argc && target__none(&trace.opts.target))
3170 		trace.opts.target.system_wide = true;
3171 
3172 	if (input_name)
3173 		err = trace__replay(&trace);
3174 	else
3175 		err = trace__run(&trace, argc, argv);
3176 
3177 out_close:
3178 	if (output_name != NULL)
3179 		fclose(trace.output);
3180 out:
3181 	return err;
3182 }
3183