xref: /linux/tools/perf/builtin-trace.c (revision 55f3538c4923e9dfca132e99ebec370e8094afda)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/env.h"
25 #include "util/event.h"
26 #include "util/evlist.h"
27 #include <subcmd/exec-cmd.h>
28 #include "util/machine.h"
29 #include "util/path.h"
30 #include "util/session.h"
31 #include "util/thread.h"
32 #include <subcmd/parse-options.h>
33 #include "util/strlist.h"
34 #include "util/intlist.h"
35 #include "util/thread_map.h"
36 #include "util/stat.h"
37 #include "trace/beauty/beauty.h"
38 #include "trace-event.h"
39 #include "util/parse-events.h"
40 #include "util/bpf-loader.h"
41 #include "callchain.h"
42 #include "print_binary.h"
43 #include "string2.h"
44 #include "syscalltbl.h"
45 #include "rb_resort.h"
46 
47 #include <errno.h>
48 #include <inttypes.h>
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/kernel.h>
56 #include <linux/random.h>
57 #include <linux/stringify.h>
58 #include <linux/time64.h>
59 #include <fcntl.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Global state for one 'perf trace' run: the tool callbacks, the per-syscall
 * formatter table, the events/threads being monitored and all the output and
 * filtering knobs parsed from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table for this arch */
	struct {
		int		max;		/* highest syscall id seen, sizes 'table' */
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the last processed event */
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to filter out (e.g. our own) */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames resolved via tracepoint */
				proc_getname;	/* filenames resolved via /proc readlink */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated (!syscall) */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* syscall id of open(), for vfs_getname matching */
};
123 
/*
 * Accessor for one tracepoint payload field: 'offset' into the raw sample
 * plus a reader that either returns the value as u64 or a pointer into the
 * raw data (the two readers are mutually exclusive, hence the union).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
131 
/*
 * TP_UINT_FIELD(bits) generates tp_field__u<bits>(): read an unsigned
 * integer of that width out of the sample's raw data at field->offset.
 * memcpy is used because raw_data + offset may be unaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Same, but byte-swapping for samples recorded with the opposite endianness. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
156 
157 static int tp_field__init_uint(struct tp_field *field,
158 			       struct format_field *format_field,
159 			       bool needs_swap)
160 {
161 	field->offset = format_field->offset;
162 
163 	switch (format_field->size) {
164 	case 1:
165 		field->integer = tp_field__u8;
166 		break;
167 	case 2:
168 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
169 		break;
170 	case 4:
171 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
172 		break;
173 	case 8:
174 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
175 		break;
176 	default:
177 		return -1;
178 	}
179 
180 	return 0;
181 }
182 
/* Return a pointer into the raw tracepoint payload at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
187 
/* Bind the pointer reader to 'field'; always succeeds (returns 0). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
194 
/*
 * Per-evsel (stored in evsel->priv) accessors for the raw_syscalls
 * tracepoints: the syscall 'id' plus either the entry 'args' block or the
 * exit 'ret' value — never both for the same tracepoint, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
201 
202 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
203 					  struct tp_field *field,
204 					  const char *name)
205 {
206 	struct format_field *format_field = perf_evsel__field(evsel, name);
207 
208 	if (format_field == NULL)
209 		return -1;
210 
211 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
212 }
213 
/* Bind the syscall_tp member 'name' (e.g. id) using the field of the same name. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
217 
/*
 * Look up tracepoint field 'name' on this evsel and bind a pointer reader
 * for it. Returns 0 on success, -1 if the field doesn't exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
229 
/* Bind the syscall_tp member 'name' (e.g. args) as a pointer into raw data. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
233 
/* Free the evsel's private syscall_tp (zfree NULLs the pointer), then the evsel. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
239 
240 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
241 {
242 	evsel->priv = malloc(sizeof(struct syscall_tp));
243 	if (evsel->priv != NULL) {
244 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
245 			goto out_delete;
246 
247 		evsel->handler = handler;
248 		return 0;
249 	}
250 
251 	return -ENOMEM;
252 
253 out_delete:
254 	zfree(&evsel->priv);
255 	return -ENOENT;
256 }
257 
258 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
259 {
260 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
261 
262 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
263 	if (IS_ERR(evsel))
264 		evsel = perf_evsel__newtp("syscalls", direction);
265 
266 	if (IS_ERR(evsel))
267 		return NULL;
268 
269 	if (perf_evsel__init_syscall_tp(evsel, handler))
270 		goto out_delete;
271 
272 	return evsel;
273 
274 out_delete:
275 	perf_evsel__delete_priv(evsel);
276 	return NULL;
277 }
278 
/* Read syscall_tp member 'name' from a sample as an integer value. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read syscall_tp member 'name' from a sample as a pointer into raw data. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
286 
287 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
288 {
289 	int idx = val - sa->offset;
290 
291 	if (idx < 0 || idx >= sa->nr_entries)
292 		return scnprintf(bf, size, intfmt, val);
293 
294 	return scnprintf(bf, size, "%s", sa->entries[idx]);
295 }
296 
/* Adapter: format arg->val via the strarray stashed in arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
303 
/* STRARRAY formatter entry point: decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
309 
310 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
311 
/*
 * A set of strarrays tried in order, for values whose names live in
 * disjoint ranges (e.g. fcntl's generic + Linux-specific command ranges).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
321 
322 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
323 					struct syscall_arg *arg)
324 {
325 	struct strarrays *sas = arg->parm;
326 	int i;
327 
328 	for (i = 0; i < sas->nr_entries; ++i) {
329 		struct strarray *sa = sas->entries[i];
330 		int idx = arg->val - sa->offset;
331 
332 		if (idx >= 0 && idx < sa->nr_entries) {
333 			if (sa->entries[idx] == NULL)
334 				break;
335 			return scnprintf(bf, size, "%s", sa->entries[idx]);
336 		}
337 	}
338 
339 	return scnprintf(bf, size, "%d", arg->val);
340 }
341 
342 #ifndef AT_FDCWD
343 #define AT_FDCWD	-100
344 #endif
345 
346 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
347 					   struct syscall_arg *arg)
348 {
349 	int fd = arg->val;
350 
351 	if (fd == AT_FDCWD)
352 		return scnprintf(bf, size, "CWD");
353 
354 	return syscall_arg__scnprintf_fd(bf, size, arg);
355 }
356 
357 #define SCA_FDAT syscall_arg__scnprintf_fd_at
358 
359 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
360 					      struct syscall_arg *arg);
361 
362 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
363 
/* Generic fallback formatters: hex (pointers/addresses), int and long. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
378 
/*
 * String tables for the STRARRAY formatter: index = value - offset (offset
 * defaults to 0 with DEFINE_STRARRAY, or is given to DEFINE_STRARRAY_OFFSET).
 * The entries mirror the kernel's enum/constant order for each syscall arg.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* start at 1, hence the offset */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* F_LINUX_SPECIFIC_BASE-relative commands; [5] skips an unused slot */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
459 
460 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
461 						 struct syscall_arg *arg)
462 {
463 	size_t printed = 0;
464 	int mode = arg->val;
465 
466 	if (mode == F_OK) /* 0 */
467 		return scnprintf(bf, size, "F");
468 #define	P_MODE(n) \
469 	if (mode & n##_OK) { \
470 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
471 		mode &= ~n##_OK; \
472 	}
473 
474 	P_MODE(R);
475 	P_MODE(W);
476 	P_MODE(X);
477 #undef P_MODE
478 
479 	if (mode)
480 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
481 
482 	return printed;
483 }
484 
485 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
486 
487 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
488 					      struct syscall_arg *arg);
489 
490 #define SCA_FILENAME syscall_arg__scnprintf_filename
491 
492 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
493 						struct syscall_arg *arg)
494 {
495 	int printed = 0, flags = arg->val;
496 
497 #define	P_FLAG(n) \
498 	if (flags & O_##n) { \
499 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
500 		flags &= ~O_##n; \
501 	}
502 
503 	P_FLAG(CLOEXEC);
504 	P_FLAG(NONBLOCK);
505 #undef P_FLAG
506 
507 	if (flags)
508 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
509 
510 	return printed;
511 }
512 
513 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
514 
515 #ifndef GRND_NONBLOCK
516 #define GRND_NONBLOCK	0x0001
517 #endif
518 #ifndef GRND_RANDOM
519 #define GRND_RANDOM	0x0002
520 #endif
521 
/*
 * Format getrandom(2) flags (GRND_RANDOM, GRND_NONBLOCK) as a '|'-separated
 * list, appending any remaining unknown bits in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
542 
543 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
544 
/* Initializer for a syscall_arg_fmt slot backed by a string table. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
548 
549 #include "trace/beauty/arch_errno_names.c"
550 #include "trace/beauty/eventfd.c"
551 #include "trace/beauty/futex_op.c"
552 #include "trace/beauty/futex_val3.c"
553 #include "trace/beauty/mmap.c"
554 #include "trace/beauty/mode_t.c"
555 #include "trace/beauty/msg_flags.c"
556 #include "trace/beauty/open_flags.c"
557 #include "trace/beauty/perf_event_open.c"
558 #include "trace/beauty/pid.c"
559 #include "trace/beauty/sched_policy.c"
560 #include "trace/beauty/seccomp.c"
561 #include "trace/beauty/signum.c"
562 #include "trace/beauty/socket_type.c"
563 #include "trace/beauty/waitid_options.c"
564 
/*
 * How to pretty-print one syscall argument: the formatter callback, its
 * optional parameter (e.g. a strarray), an override name, and whether a
 * zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
571 
572 static struct syscall_fmt {
573 	const char *name;
574 	const char *alias;
575 	struct syscall_arg_fmt arg[6];
576 	u8	   nr_args;
577 	bool	   errpid;
578 	bool	   timeout;
579 	bool	   hexret;
580 } syscall_fmts[] = {
581 	{ .name	    = "access",
582 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
583 	{ .name	    = "bpf",
584 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
585 	{ .name	    = "brk",	    .hexret = true,
586 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
587 	{ .name     = "clock_gettime",
588 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
589 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
590 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
591 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
592 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
593 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
594 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
595 	{ .name	    = "close",
596 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
597 	{ .name	    = "epoll_ctl",
598 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
599 	{ .name	    = "eventfd2",
600 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
601 	{ .name	    = "fchmodat",
602 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
603 	{ .name	    = "fchownat",
604 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
605 	{ .name	    = "fcntl",
606 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
607 			   .parm      = &strarrays__fcntl_cmds_arrays,
608 			   .show_zero = true, },
609 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
610 	{ .name	    = "flock",
611 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
612 	{ .name	    = "fstat", .alias = "newfstat", },
613 	{ .name	    = "fstatat", .alias = "newfstatat", },
614 	{ .name	    = "futex",
615 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
616 		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
617 	{ .name	    = "futimesat",
618 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
619 	{ .name	    = "getitimer",
620 	  .arg = { [0] = STRARRAY(which, itimers), }, },
621 	{ .name	    = "getpid",	    .errpid = true, },
622 	{ .name	    = "getpgid",    .errpid = true, },
623 	{ .name	    = "getppid",    .errpid = true, },
624 	{ .name	    = "getrandom",
625 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
626 	{ .name	    = "getrlimit",
627 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
628 	{ .name	    = "gettid",	    .errpid = true, },
629 	{ .name	    = "ioctl",
630 	  .arg = {
631 #if defined(__i386__) || defined(__x86_64__)
632 /*
633  * FIXME: Make this available to all arches.
634  */
635 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
636 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
637 #else
638 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
639 #endif
640 	{ .name	    = "kcmp",	    .nr_args = 5,
641 	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
642 		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
643 		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
644 		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
645 		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
646 	{ .name	    = "keyctl",
647 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
648 	{ .name	    = "kill",
649 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
650 	{ .name	    = "linkat",
651 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
652 	{ .name	    = "lseek",
653 	  .arg = { [2] = STRARRAY(whence, whences), }, },
654 	{ .name	    = "lstat", .alias = "newlstat", },
655 	{ .name     = "madvise",
656 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
657 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
658 	{ .name	    = "mkdirat",
659 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
660 	{ .name	    = "mknodat",
661 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
662 	{ .name	    = "mlock",
663 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
664 	{ .name	    = "mlockall",
665 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
666 	{ .name	    = "mmap",	    .hexret = true,
667 /* The standard mmap maps to old_mmap on s390x */
668 #if defined(__s390x__)
669 	.alias = "old_mmap",
670 #endif
671 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
672 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
673 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
674 	{ .name	    = "mprotect",
675 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
676 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
677 	{ .name	    = "mq_unlink",
678 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
679 	{ .name	    = "mremap",	    .hexret = true,
680 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
681 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
682 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
683 	{ .name	    = "munlock",
684 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
685 	{ .name	    = "munmap",
686 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
687 	{ .name	    = "name_to_handle_at",
688 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
689 	{ .name	    = "newfstatat",
690 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
691 	{ .name	    = "open",
692 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
693 	{ .name	    = "open_by_handle_at",
694 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
695 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "openat",
697 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
698 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
699 	{ .name	    = "perf_event_open",
700 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
701 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
702 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
703 	{ .name	    = "pipe2",
704 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
705 	{ .name	    = "pkey_alloc",
706 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
707 	{ .name	    = "pkey_free",
708 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
709 	{ .name	    = "pkey_mprotect",
710 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
711 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
712 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
713 	{ .name	    = "poll", .timeout = true, },
714 	{ .name	    = "ppoll", .timeout = true, },
715 	{ .name	    = "prctl", .alias = "arch_prctl",
716 	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
717 		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
718 		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
719 	{ .name	    = "pread", .alias = "pread64", },
720 	{ .name	    = "preadv", .alias = "pread", },
721 	{ .name	    = "prlimit64",
722 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
723 	{ .name	    = "pwrite", .alias = "pwrite64", },
724 	{ .name	    = "readlinkat",
725 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
726 	{ .name	    = "recvfrom",
727 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
728 	{ .name	    = "recvmmsg",
729 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
730 	{ .name	    = "recvmsg",
731 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
732 	{ .name	    = "renameat",
733 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
734 	{ .name	    = "rt_sigaction",
735 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
736 	{ .name	    = "rt_sigprocmask",
737 	  .arg = { [0] = STRARRAY(how, sighow), }, },
738 	{ .name	    = "rt_sigqueueinfo",
739 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
740 	{ .name	    = "rt_tgsigqueueinfo",
741 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
742 	{ .name	    = "sched_setscheduler",
743 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
744 	{ .name	    = "seccomp",
745 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
746 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
747 	{ .name	    = "select", .timeout = true, },
748 	{ .name	    = "sendmmsg",
749 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
750 	{ .name	    = "sendmsg",
751 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
752 	{ .name	    = "sendto",
753 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
754 	{ .name	    = "set_tid_address", .errpid = true, },
755 	{ .name	    = "setitimer",
756 	  .arg = { [0] = STRARRAY(which, itimers), }, },
757 	{ .name	    = "setrlimit",
758 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
759 	{ .name	    = "socket",
760 	  .arg = { [0] = STRARRAY(family, socket_families),
761 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
762 	{ .name	    = "socketpair",
763 	  .arg = { [0] = STRARRAY(family, socket_families),
764 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
765 	{ .name	    = "stat", .alias = "newstat", },
766 	{ .name	    = "statx",
767 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
768 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
769 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
770 	{ .name	    = "swapoff",
771 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
772 	{ .name	    = "swapon",
773 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
774 	{ .name	    = "symlinkat",
775 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
776 	{ .name	    = "tgkill",
777 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
778 	{ .name	    = "tkill",
779 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
780 	{ .name	    = "uname", .alias = "newuname", },
781 	{ .name	    = "unlinkat",
782 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
783 	{ .name	    = "utimensat",
784 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
785 	{ .name	    = "wait4",	    .errpid = true,
786 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
787 	{ .name	    = "waitid",	    .errpid = true,
788 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
789 };
790 
/* bsearch() comparator: key is the syscall name, element is a syscall_fmt. */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
796 
797 static struct syscall_fmt *syscall_fmt__find(const char *name)
798 {
799 	const int nmemb = ARRAY_SIZE(syscall_fmts);
800 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
801 }
802 
/*
 * Everything known about one syscall id: its tracepoint format, argument
 * descriptions, and the formatting overrides looked up in syscall_fmts.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of tracepoint arg fields */
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: no sys_exit event follows */
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;	/* one formatter per argument */
};
812 
813 /*
814  * We need to have this 'calculated' boolean because in some cases we really
815  * don't know what is the duration of a syscall, for instance, when we start
816  * a session and some threads are waiting for a syscall to finish, say 'poll',
817  * in which case all we can do is to print "( ? ) for duration and for the
818  * start timestamp.
819  */
820 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
821 {
822 	double duration = (double)t / NSEC_PER_MSEC;
823 	size_t printed = fprintf(fp, "(");
824 
825 	if (!calculated)
826 		printed += fprintf(fp, "         ");
827 	else if (duration >= 1.0)
828 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
829 	else if (duration >= 0.01)
830 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
831 	else
832 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
833 	return printed + fprintf(fp, "): ");
834 }
835 
836 /**
837  * filename.ptr: The filename char pointer that will be vfs_getname'd
838  * filename.entry_str_pos: Where to insert the string translated from
839  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
840  * ret_scnprintf: syscall args may set this to a different syscall return
841  *                formatter, for instance, fcntl may return fds, file flags, etc.
842  */
/* Per-thread tracing state, stored in thread->priv via thread__set_priv(). */
struct thread_trace {
	u64		  entry_time;		/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter printed, awaiting sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;		/* major/minor page fault counts */
	char		  *entry_str;		/* formatted sys_enter line, flushed at exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;		/* filename pointer awaiting vfs_getname */
		short int     entry_str_pos;	/* where to splice the resolved name */
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;			/* highest fd cached; -1 = table unallocated */
		char	  **table;		/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;
};
865 
866 static struct thread_trace *thread_trace__new(void)
867 {
868 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
869 
870 	if (ttrace)
871 		ttrace->paths.max = -1;
872 
873 	ttrace->syscall_stats = intlist__new(NULL);
874 
875 	return ttrace;
876 }
877 
878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
879 {
880 	struct thread_trace *ttrace;
881 
882 	if (thread == NULL)
883 		goto fail;
884 
885 	if (thread__priv(thread) == NULL)
886 		thread__set_priv(thread, thread_trace__new());
887 
888 	if (thread__priv(thread) == NULL)
889 		goto fail;
890 
891 	ttrace = thread__priv(thread);
892 	++ttrace->nr_events;
893 
894 	return ttrace;
895 fail:
896 	color_fprintf(fp, PERF_COLOR_RED,
897 		      "WARNING: not enough memory, dropping samples!\n");
898 	return NULL;
899 }
900 
901 
/*
 * Let an argument formatter override how this syscall's return value is
 * printed (e.g. fcntl returning an fd or file flags depending on cmd).
 * NOTE(review): thread__priv() is not NULL-checked here — presumably all
 * callers run after thread__trace() succeeded; confirm before relying on it.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
909 
910 #define TRACE_PFMAJ		(1 << 0)
911 #define TRACE_PFMIN		(1 << 1)
912 
913 static const size_t trace__entry_str_size = 2048;
914 
/*
 * Cache 'pathname' as the name of file descriptor 'fd' for 'thread',
 * growing the per-thread fd->path table on demand.
 * Returns 0 on success, -1 on allocation failure.
 *
 * NOTE(review): an existing string at table[fd] is overwritten without
 * being freed, so replacing an entry leaks the old copy — confirm callers
 * only set each slot once (close clears it via zfree).
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* Zero only the newly appended slots: [max + 1, fd]. */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First allocation (max == -1): whole table starts empty. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
940 
/*
 * Resolve 'fd' to a pathname by reading the thread's /proc fd symlink and
 * cache the result via trace__set_fd_pathname().
 * Returns 0 on success, -1 on any failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* Group leaders live directly under /proc/PID, other threads under task/. */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/*
	 * st_size is used to bound-check the readlink() result below.
	 * NOTE(review): this assumes /proc fd symlinks report a non-zero
	 * st_size equal to the target length — verify on target kernels,
	 * some procfs symlinks report st_size == 0.
	 */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
966 
967 static const char *thread__fd_path(struct thread *thread, int fd,
968 				   struct trace *trace)
969 {
970 	struct thread_trace *ttrace = thread__priv(thread);
971 
972 	if (ttrace == NULL)
973 		return NULL;
974 
975 	if (fd < 0)
976 		return NULL;
977 
978 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
979 		if (!trace->live)
980 			return NULL;
981 		++trace->stats.proc_getname;
982 		if (thread__read_fd_path(thread, fd))
983 			return NULL;
984 	}
985 
986 	return ttrace->paths.table[fd];
987 }
988 
989 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
990 {
991 	int fd = arg->val;
992 	size_t printed = scnprintf(bf, size, "%d", fd);
993 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
994 
995 	if (path)
996 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
997 
998 	return printed;
999 }
1000 
1001 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1002 {
1003         size_t printed = scnprintf(bf, size, "%d", fd);
1004 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1005 
1006 	if (thread) {
1007 		const char *path = thread__fd_path(thread, fd, trace);
1008 
1009 		if (path)
1010 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1011 
1012 		thread__put(thread);
1013 	}
1014 
1015         return printed;
1016 }
1017 
1018 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1019 					      struct syscall_arg *arg)
1020 {
1021 	int fd = arg->val;
1022 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1023 	struct thread_trace *ttrace = thread__priv(arg->thread);
1024 
1025 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1026 		zfree(&ttrace->paths.table[fd]);
1027 
1028 	return printed;
1029 }
1030 
1031 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1032 				     unsigned long ptr)
1033 {
1034 	struct thread_trace *ttrace = thread__priv(thread);
1035 
1036 	ttrace->filename.ptr = ptr;
1037 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1038 }
1039 
1040 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1041 					      struct syscall_arg *arg)
1042 {
1043 	unsigned long ptr = arg->val;
1044 
1045 	if (!arg->trace->vfs_getname)
1046 		return scnprintf(bf, size, "%#x", ptr);
1047 
1048 	thread__set_filename_pos(arg->thread, bf, ptr);
1049 	return 0;
1050 }
1051 
1052 static bool trace__filter_duration(struct trace *trace, double t)
1053 {
1054 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1055 }
1056 
1057 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1058 {
1059 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1060 
1061 	return fprintf(fp, "%10.3f ", ts);
1062 }
1063 
1064 /*
1065  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1066  * using ttrace->entry_time for a thread that receives a sys_exit without
1067  * first having received a sys_enter ("poll" issued before tracing session
1068  * starts, lost sys_enter exit due to ring buffer overflow).
1069  */
1070 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1071 {
1072 	if (tstamp > 0)
1073 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1074 
1075 	return fprintf(fp, "         ? ");
1076 }
1077 
/*
 * Termination flags set from the signal handler and polled by the main
 * loop.  Fix: they must be 'volatile sig_atomic_t' — the only object type
 * the C standard guarantees may be safely written from an asynchronous
 * signal handler (C11 5.1.2.3, CERT SIG31-C); plain bool reads may also
 * be cached by the compiler without 'volatile'.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	/* Only SIGINT (Ctrl-C) counts as a user interrupt; SIGCHLD et al don't. */
	interrupted = sig == SIGINT;
}
1086 
1087 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1088 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1089 {
1090 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1091 	printed += fprintf_duration(duration, duration_calculated, fp);
1092 
1093 	if (trace->multiple_threads) {
1094 		if (trace->show_comm)
1095 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1096 		printed += fprintf(fp, "%d ", thread->tid);
1097 	}
1098 
1099 	return printed;
1100 }
1101 
1102 static int trace__process_event(struct trace *trace, struct machine *machine,
1103 				union perf_event *event, struct perf_sample *sample)
1104 {
1105 	int ret = 0;
1106 
1107 	switch (event->header.type) {
1108 	case PERF_RECORD_LOST:
1109 		color_fprintf(trace->output, PERF_COLOR_RED,
1110 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1111 		ret = machine__process_lost_event(machine, event, sample);
1112 		break;
1113 	default:
1114 		ret = machine__process_event(machine, event, sample);
1115 		break;
1116 	}
1117 
1118 	return ret;
1119 }
1120 
1121 static int trace__tool_process(struct perf_tool *tool,
1122 			       union perf_event *event,
1123 			       struct perf_sample *sample,
1124 			       struct machine *machine)
1125 {
1126 	struct trace *trace = container_of(tool, struct trace, tool);
1127 	return trace__process_event(trace, machine, event, sample);
1128 }
1129 
1130 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1131 {
1132 	struct machine *machine = vmachine;
1133 
1134 	if (machine->kptr_restrict_warned)
1135 		return NULL;
1136 
1137 	if (symbol_conf.kptr_restrict) {
1138 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1139 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1140 			   "Kernel samples will not be resolved.\n");
1141 		machine->kptr_restrict_warned = true;
1142 		return NULL;
1143 	}
1144 
1145 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1146 }
1147 
/*
 * Initialize symbol resolution, create the host machine object, install
 * the kptr_restrict-aware resolver and synthesize events for already
 * running threads so their maps are known.  Returns 0 on success, a
 * negative error code otherwise (symbol state is torn down on failure;
 * trace->host teardown is left to trace__symbols__exit()).
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout, 1);
out:
	if (err)
		symbol__exit();

	return err;
}
1172 
1173 static void trace__symbols__exit(struct trace *trace)
1174 {
1175 	machine__exit(trace->host);
1176 	trace->host = NULL;
1177 
1178 	symbol__exit();
1179 }
1180 
/*
 * Allocate sc->arg_fmt with one slot per syscall argument, seeding every
 * slot from the hand-written per-syscall format table when one exists.
 * Returns 0 on success, -1 on allocation failure.
 */
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	/*
	 * 6 is the generic "max syscall args" fallback used when the
	 * tracepoint format wasn't readable; prefer the count from the
	 * static table when it specifies one.
	 */
	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}
1200 
/*
 * Pick a default beautifier for each argument that doesn't already have a
 * hand-written one, keyed on the tracepoint field's type and name:
 * filename-ish strings, pointers, pid_t, umode_t and *fd integers.
 * Always returns 0.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit entry in the static format table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Heuristic: integer fields whose names end in "fd"
			 * are file descriptors.  Survey of the tracepoints:
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1239 
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, look up
 * its static format entry, read the sys_enter_* tracepoint format (falling
 * back to the entry's alias) and set up per-argument formatters.  The
 * table is grown on demand.  Returns 0 on success, -1 otherwise.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly appended entries: [max + 1, id]. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation (max == -1): whole table starts empty. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints use an alias, e.g. old vs new names. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Allocate arg_fmt even without a tp_format, using the 6-arg fallback. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * The first field, '__syscall_nr' (or 'nr' on older kernels), is the
	 * syscall number itself and not a real argument, so drop it when
	 * present.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1300 
/*
 * Turn the -e syscall name list (trace->ev_qualifier) into an array of
 * syscall ids in trace->ev_qualifier_ids, expanding glob patterns (e.g.
 * 'open*') to all matching syscalls.  Returns 0 on success, -EINVAL when
 * a name is unknown (all offenders are reported), -ENOMEM when the id
 * array cannot be grown.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact syscall name, maybe a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Unknown name: accumulate all offenders on one line. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Glob matched: append every further match, growing in chunks of 8. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1376 
1377 /*
1378  * args is to be interpreted as a series of longs but we need to handle
1379  * 8-byte unaligned accesses. args points to raw_data within the event
1380  * and raw_data is guaranteed to be 8-byte unaligned because it is
1381  * preceded by raw_size which is a u32. So we need to copy args to a temp
1382  * variable to read it. Most notably this avoids extended load instructions
1383  * on unaligned addresses
1384  */
1385 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1386 {
1387 	unsigned long val;
1388 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1389 
1390 	memcpy(&val, p, sizeof(val));
1391 	return val;
1392 }
1393 
1394 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1395 				      struct syscall_arg *arg)
1396 {
1397 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1398 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1399 
1400 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1401 }
1402 
1403 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1404 				     struct syscall_arg *arg, unsigned long val)
1405 {
1406 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1407 		arg->val = val;
1408 		if (sc->arg_fmt[arg->idx].parm)
1409 			arg->parm = sc->arg_fmt[arg->idx].parm;
1410 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1411 	}
1412 	return scnprintf(bf, size, "%ld", val);
1413 }
1414 
/*
 * Format all of 'sc's arguments from the raw 'args' payload into 'bf'.
 * Uses the tracepoint field names when the /format file was read,
 * otherwise prints positional raw values.  Returns the number of
 * characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* tracks arg.idx as a bit for the 'mask' check */
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* A beautifier may consume several args, masking them out. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in an strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1487 
1488 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1489 				  union perf_event *event,
1490 				  struct perf_sample *sample);
1491 
/*
 * Map a raw syscall id to its struct syscall entry, reading the
 * tracepoint info lazily on first use.  Returns NULL — with a diagnostic
 * on trace->output at sufficient verbosity — when the id is negative or
 * its information cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily read the syscall's info the first time this id shows up. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed silently before. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1534 
1535 static void thread__update_stats(struct thread_trace *ttrace,
1536 				 int id, struct perf_sample *sample)
1537 {
1538 	struct int_node *inode;
1539 	struct stats *stats;
1540 	u64 duration = 0;
1541 
1542 	inode = intlist__findnew(ttrace->syscall_stats, id);
1543 	if (inode == NULL)
1544 		return;
1545 
1546 	stats = inode->priv;
1547 	if (stats == NULL) {
1548 		stats = malloc(sizeof(struct stats));
1549 		if (stats == NULL)
1550 			return;
1551 		init_stats(stats);
1552 		inode->priv = stats;
1553 	}
1554 
1555 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1556 		duration = sample->time - ttrace->entry_time;
1557 
1558 	update_stats(stats, duration);
1559 }
1560 
1561 static int trace__printf_interrupted_entry(struct trace *trace)
1562 {
1563 	struct thread_trace *ttrace;
1564 	size_t printed;
1565 
1566 	if (trace->current == NULL)
1567 		return 0;
1568 
1569 	ttrace = thread__priv(trace->current);
1570 
1571 	if (!ttrace->entry_pending)
1572 		return 0;
1573 
1574 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1575 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1576 	ttrace->entry_pending = false;
1577 
1578 	return printed;
1579 }
1580 
1581 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1582 				 struct perf_sample *sample, struct thread *thread)
1583 {
1584 	int printed = 0;
1585 
1586 	if (trace->print_sample) {
1587 		double ts = (double)sample->time / NSEC_PER_MSEC;
1588 
1589 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1590 				   perf_evsel__name(evsel), ts,
1591 				   thread__comm_str(thread),
1592 				   sample->pid, sample->tid, sample->cpu);
1593 	}
1594 
1595 	return printed;
1596 }
1597 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the per-thread
 * entry_str buffer.  For most syscalls the line is held back until the
 * matching sys_exit arrives so duration and return value can be appended;
 * exit-like syscalls (which never return) are printed immediately.
 * Returns 0 on success, -1 on failure.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the entry line is staged in. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any other thread's pending entry before starting this one. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() have no sys_exit, print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1658 
1659 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1660 				    struct perf_sample *sample,
1661 				    struct callchain_cursor *cursor)
1662 {
1663 	struct addr_location al;
1664 
1665 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1666 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, evsel->attr.sample_max_stack))
1667 		return -1;
1668 
1669 	return 0;
1670 }
1671 
1672 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1673 {
1674 	/* TODO: user-configurable print_opts */
1675 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1676 				        EVSEL__PRINT_DSO |
1677 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1678 
1679 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1680 }
1681 
/*
 * Translate an errno value to its name ("ENOENT", ...) using the errno
 * table of the architecture the event was recorded on, which may differ
 * from the one perf runs on.
 */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	struct perf_env *env = perf_evsel__env(evsel);

	return arch_syscalls__strerrno(perf_env__arch(env), err);
}
1689 
/*
 * raw_syscalls:sys_exit handler: pair the exit with the thread's pending
 * entry, compute the syscall duration and pretty print the return value,
 * delegating to per-syscall return formatters when available.
 * Returns 0 on success, -1 when the thread's state can't be obtained.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open-like syscall: cache fd -> name captured by vfs_getname. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: suppress the whole line. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry line already went out (interrupted); mark continuation. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Now the return value, picking the most specific formatter available. */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed at entry time, e.g. by fcntl's 'cmd'. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork, wait4, ...): show its comm too. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1805 
/*
 * probe:vfs_getname handler: capture the pathname the kernel resolved
 * and, when a syscall entry line is pending with a raw pointer in it,
 * splice the name into ttrace->entry_str at the position recorded by
 * thread__set_filename_pos().  Always returns 0.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Keep a copy so trace__sys_exit() can map the returned fd to it. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No entry line waiting for a name, nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* If the name doesn't fit, keep its tail — the most specific part. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1866 
1867 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1868 				     union perf_event *event __maybe_unused,
1869 				     struct perf_sample *sample)
1870 {
1871         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1872 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1873 	struct thread *thread = machine__findnew_thread(trace->host,
1874 							sample->pid,
1875 							sample->tid);
1876 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1877 
1878 	if (ttrace == NULL)
1879 		goto out_dump;
1880 
1881 	ttrace->runtime_ms += runtime_ms;
1882 	trace->runtime_ms += runtime_ms;
1883 out_put:
1884 	thread__put(thread);
1885 	return 0;
1886 
1887 out_dump:
1888 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1889 	       evsel->name,
1890 	       perf_evsel__strval(evsel, sample, "comm"),
1891 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1892 	       runtime,
1893 	       perf_evsel__intval(evsel, sample, "vruntime"));
1894 	goto out_put;
1895 }
1896 
1897 static int bpf_output__printer(enum binary_printer_ops op,
1898 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
1899 {
1900 	unsigned char ch = (unsigned char)val;
1901 
1902 	switch (op) {
1903 	case BINARY_PRINT_CHAR_DATA:
1904 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
1905 	case BINARY_PRINT_DATA_BEGIN:
1906 	case BINARY_PRINT_LINE_BEGIN:
1907 	case BINARY_PRINT_ADDR:
1908 	case BINARY_PRINT_NUM_DATA:
1909 	case BINARY_PRINT_NUM_PAD:
1910 	case BINARY_PRINT_SEP:
1911 	case BINARY_PRINT_CHAR_PAD:
1912 	case BINARY_PRINT_LINE_END:
1913 	case BINARY_PRINT_DATA_END:
1914 	default:
1915 		break;
1916 	}
1917 
1918 	return 0;
1919 }
1920 
1921 static void bpf_output__fprintf(struct trace *trace,
1922 				struct perf_sample *sample)
1923 {
1924 	binary__fprintf(sample->raw_data, sample->raw_size, 8,
1925 			bpf_output__printer, NULL, trace->output);
1926 }
1927 
/*
 * Generic handler for non-syscall tracepoints and bpf-output events:
 * prints a timestamped line with either a character dump (bpf output) or
 * the libtraceevent-rendered payload.  Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Shallower than --min-stack: drop the event entirely. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	/* Finish off any held-back syscall entry line first. */
	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with syscall lines, which show a duration here. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1968 
1969 static void print_location(FILE *f, struct perf_sample *sample,
1970 			   struct addr_location *al,
1971 			   bool print_dso, bool print_sym)
1972 {
1973 
1974 	if ((verbose > 0 || print_dso) && al->map)
1975 		fprintf(f, "%s@", al->map->dso->long_name);
1976 
1977 	if ((verbose > 0 || print_sym) && al->sym)
1978 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1979 			al->addr - al->sym->start);
1980 	else if (al->map)
1981 		fprintf(f, "0x%" PRIx64, al->addr);
1982 	else
1983 		fprintf(f, "0x%" PRIx64, sample->addr);
1984 }
1985 
/*
 * Handler for major/minor page fault software events: bumps the per-thread
 * fault counters and, unless --summary-only, prints a line with the faulting
 * IP and target address resolved to symbols/maps where possible.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata; becomes 'x' (exec map) or '?' (unknown) */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: drop the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* Counters updated; skip the per-event line in summary-only mode. */
	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulting data address; fall back to executable maps. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2061 
2062 static void trace__set_base_time(struct trace *trace,
2063 				 struct perf_evsel *evsel,
2064 				 struct perf_sample *sample)
2065 {
2066 	/*
2067 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2068 	 * and don't use sample->time unconditionally, we may end up having
2069 	 * some other event in the future without PERF_SAMPLE_TIME for good
2070 	 * reason, i.e. we may not be interested in its timestamps, just in
2071 	 * it taking place, picking some piece of information when it
2072 	 * appears in our event stream (vfs_getname comes to mind).
2073 	 */
2074 	if (trace->base_time == 0 && !trace->full_time &&
2075 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2076 		trace->base_time = sample->time;
2077 }
2078 
2079 static int trace__process_sample(struct perf_tool *tool,
2080 				 union perf_event *event,
2081 				 struct perf_sample *sample,
2082 				 struct perf_evsel *evsel,
2083 				 struct machine *machine __maybe_unused)
2084 {
2085 	struct trace *trace = container_of(tool, struct trace, tool);
2086 	struct thread *thread;
2087 	int err = 0;
2088 
2089 	tracepoint_handler handler = evsel->handler;
2090 
2091 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2092 	if (thread && thread__is_filtered(thread))
2093 		goto out;
2094 
2095 	trace__set_base_time(trace, evsel, sample);
2096 
2097 	if (handler) {
2098 		++trace->nr_events;
2099 		handler(trace, evsel, event, sample);
2100 	}
2101 out:
2102 	thread__put(thread);
2103 	return err;
2104 }
2105 
2106 static int trace__record(struct trace *trace, int argc, const char **argv)
2107 {
2108 	unsigned int rec_argc, i, j;
2109 	const char **rec_argv;
2110 	const char * const record_args[] = {
2111 		"record",
2112 		"-R",
2113 		"-m", "1024",
2114 		"-c", "1",
2115 	};
2116 
2117 	const char * const sc_args[] = { "-e", };
2118 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2119 	const char * const majpf_args[] = { "-e", "major-faults" };
2120 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2121 	const char * const minpf_args[] = { "-e", "minor-faults" };
2122 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2123 
2124 	/* +1 is for the event string below */
2125 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2126 		majpf_args_nr + minpf_args_nr + argc;
2127 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2128 
2129 	if (rec_argv == NULL)
2130 		return -ENOMEM;
2131 
2132 	j = 0;
2133 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2134 		rec_argv[j++] = record_args[i];
2135 
2136 	if (trace->trace_syscalls) {
2137 		for (i = 0; i < sc_args_nr; i++)
2138 			rec_argv[j++] = sc_args[i];
2139 
2140 		/* event string may be different for older kernels - e.g., RHEL6 */
2141 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2142 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2143 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2144 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2145 		else {
2146 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2147 			free(rec_argv);
2148 			return -1;
2149 		}
2150 	}
2151 
2152 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2153 		for (i = 0; i < majpf_args_nr; i++)
2154 			rec_argv[j++] = majpf_args[i];
2155 
2156 	if (trace->trace_pgfaults & TRACE_PFMIN)
2157 		for (i = 0; i < minpf_args_nr; i++)
2158 			rec_argv[j++] = minpf_args[i];
2159 
2160 	for (i = 0; i < (unsigned int)argc; i++)
2161 		rec_argv[j++] = argv[i];
2162 
2163 	return cmd_record(j, rec_argv);
2164 }
2165 
2166 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2167 
2168 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2169 {
2170 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2171 
2172 	if (IS_ERR(evsel))
2173 		return false;
2174 
2175 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2176 		perf_evsel__delete(evsel);
2177 		return false;
2178 	}
2179 
2180 	evsel->handler = trace__vfs_getname;
2181 	perf_evlist__add(evlist, evsel);
2182 	return true;
2183 }
2184 
2185 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2186 {
2187 	struct perf_evsel *evsel;
2188 	struct perf_event_attr attr = {
2189 		.type = PERF_TYPE_SOFTWARE,
2190 		.mmap_data = 1,
2191 	};
2192 
2193 	attr.config = config;
2194 	attr.sample_period = 1;
2195 
2196 	event_attr_init(&attr);
2197 
2198 	evsel = perf_evsel__new(&attr);
2199 	if (evsel)
2200 		evsel->handler = trace__pgfault;
2201 
2202 	return evsel;
2203 }
2204 
/*
 * Dispatch one event from the live mmap ring: non-sample records go to the
 * generic machinery, samples are routed to the handler registered on the
 * evsel that produced them.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* Tracepoint handlers dereference raw_data, so guard against NULL. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2233 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, wire up
 * their handlers and payload field accessors, configure callchains and add
 * them to the evlist.  Returns 0 on success, -1 on failure (partially
 * created evsels are cleaned up via the goto ladder).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2282 
2283 static int trace__set_ev_qualifier_filter(struct trace *trace)
2284 {
2285 	int err = -1;
2286 	struct perf_evsel *sys_exit;
2287 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2288 						trace->ev_qualifier_ids.nr,
2289 						trace->ev_qualifier_ids.entries);
2290 
2291 	if (filter == NULL)
2292 		goto out_enomem;
2293 
2294 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2295 					  filter)) {
2296 		sys_exit = trace->syscalls.events.sys_exit;
2297 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2298 	}
2299 
2300 	free(filter);
2301 out:
2302 	return err;
2303 out_enomem:
2304 	errno = ENOMEM;
2305 	goto out;
2306 }
2307 
/*
 * Avoid a feedback loop in system-wide mode: filter out our own pid and,
 * when running over ssh, the ancestor sshd that is shipping our output to
 * the terminal (its writes would otherwise generate events we then print,
 * generating more writes, ad infinitum).
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain looking for an sshd, bounded by the array. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2331 
/*
 * The live-mode main loop: set up the requested events (syscalls,
 * vfs_getname, page faults, sched_stat_runtime), create/open/mmap the
 * evlist, optionally fork the workload, then consume the ring buffers
 * until interrupted or the workload finishes, printing each event and,
 * at the end, the optional summary.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	/* --delay: let the workload warm up before enabling the events. */
	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* Print tids when events may come from more than one thread. */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitely set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if ((evsel->attr.sample_type & PERF_SAMPLE_CALLCHAIN) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload exited: stop producing, keep draining the ring. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new this pass: poll for more unless we're draining. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Out-of-line error reporting: this brace-scoped block after the normal
 * return exists only to give the error labels below a shared errbuf.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2573 
/*
 * Replay mode ('perf trace -i perf.data'): set up the perf_tool callbacks,
 * open the session, attach syscall/page-fault handlers to the recorded
 * events and process the file, optionally printing the summary at the end.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page-fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2674 
/* Print the banner preceding the per-thread summary table. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2683 
/*
 * rb_resort.h boilerplate: re-sort a thread's per-syscall stats intlist by
 * total time spent (msecs, descending) for the summary table.  The trailing
 * block fills one sorted entry from an intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr_calls * average, converted from ns to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2697 
/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * times in msecs and stddev as a percentage of the average), ordered by
 * total time spent.  Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	/* declares 'syscall_stats', sorted per the DEFINE_RESORT_RB above */
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative stddev, computed before avg is scaled to msecs */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2740 
2741 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2742 {
2743 	size_t printed = 0;
2744 	struct thread_trace *ttrace = thread__priv(thread);
2745 	double ratio;
2746 
2747 	if (ttrace == NULL)
2748 		return 0;
2749 
2750 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2751 
2752 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2753 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2754 	printed += fprintf(fp, "%.1f%%", ratio);
2755 	if (ttrace->pfmaj)
2756 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2757 	if (ttrace->pfmin)
2758 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2759 	if (trace->sched)
2760 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2761 	else if (fputc('\n', fp) != EOF)
2762 		++printed;
2763 
2764 	printed += thread__dump_stats(ttrace, trace, fp);
2765 
2766 	return printed;
2767 }
2768 
2769 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2770 {
2771 	return ttrace ? ttrace->nr_events : 0;
2772 }
2773 
/*
 * rb_resort.h boilerplate: order threads by event count so the summary
 * prints the busiest threads last.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2780 
/*
 * Print the end-of-run summary: one entry per traced thread, walking every
 * bucket of the machine's threads hash table, each bucket re-sorted by
 * event count via the DEFINE_RESORT_RB(threads, ...) above.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		/* declares 'threads', the re-sorted copy of this bucket */
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2802 
2803 static int trace__set_duration(const struct option *opt, const char *str,
2804 			       int unset __maybe_unused)
2805 {
2806 	struct trace *trace = opt->value;
2807 
2808 	trace->duration_filter = atof(str);
2809 	return 0;
2810 }
2811 
2812 static int trace__set_filter_pids(const struct option *opt, const char *str,
2813 				  int unset __maybe_unused)
2814 {
2815 	int ret = -1;
2816 	size_t i;
2817 	struct trace *trace = opt->value;
2818 	/*
2819 	 * FIXME: introduce a intarray class, plain parse csv and create a
2820 	 * { int nr, int entries[] } struct...
2821 	 */
2822 	struct intlist *list = intlist__new(str);
2823 
2824 	if (list == NULL)
2825 		return -1;
2826 
2827 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2828 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2829 
2830 	if (trace->filter_pids.entries == NULL)
2831 		goto out;
2832 
2833 	trace->filter_pids.entries[0] = getpid();
2834 
2835 	for (i = 1; i < trace->filter_pids.nr; ++i)
2836 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2837 
2838 	intlist__delete(list);
2839 	ret = 0;
2840 out:
2841 	return ret;
2842 }
2843 
2844 static int trace__open_output(struct trace *trace, const char *filename)
2845 {
2846 	struct stat st;
2847 
2848 	if (!stat(filename, &st) && st.st_size) {
2849 		char oldname[PATH_MAX];
2850 
2851 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2852 		unlink(oldname);
2853 		rename(filename, oldname);
2854 	}
2855 
2856 	trace->output = fopen(filename, "w");
2857 
2858 	return trace->output == NULL ? -errno : 0;
2859 }
2860 
2861 static int parse_pagefaults(const struct option *opt, const char *str,
2862 			    int unset __maybe_unused)
2863 {
2864 	int *trace_pgfaults = opt->value;
2865 
2866 	if (strcmp(str, "all") == 0)
2867 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2868 	else if (strcmp(str, "maj") == 0)
2869 		*trace_pgfaults |= TRACE_PFMAJ;
2870 	else if (strcmp(str, "min") == 0)
2871 		*trace_pgfaults |= TRACE_PFMIN;
2872 	else
2873 		return -1;
2874 
2875 	return 0;
2876 }
2877 
/* Point every evsel in the list at the same sample handler callback. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2885 
2886 /*
2887  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2888  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2889  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2890  *
2891  * It'd be better to introduce a parse_options() variant that would return a
2892  * list with the terms it didn't match to an event...
2893  */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };	/* [0]=events, [1]=syscalls */
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* leading '!' negates the whole syscall qualifier */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	/* Split the csv: each term goes to the syscall or the event list. */
	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/* A term is a syscall if it names/globs one or is a strace group file. */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		/* Append to the chosen list; len bounds the total, so no overflow. */
		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* restore the ',' we NUL-ed so str stays intact for the caller */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	/* Hand the non-syscall terms to the stock -e/--event parser. */
	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	if (sep)
		*sep = ',';

	return err;
}
2971 
/*
 * Entry point for 'perf trace': parses options, decides between the
 * 'record' subcommand, replaying a perf.data file (-i) and live tracing,
 * then dispatches to trace__record()/trace__replay()/trace__run().
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * Defaults: strace-like output on stderr, syscalls traced, no kernel
	 * callchains.  UINT_MAX/ULLONG_MAX act as "not set by the user"
	 * sentinels that are resolved after option parsing below.
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	/* -e and --expr share the same callback: syscalls vs. other events. */
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace instead of dying silently on SEGV/FPE. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/* Stop at the first non-option so the workload's own args survive. */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault resolution needs the faulting address and timestamps. */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinel means the user didn't pass -m/--mmap-pages. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	/*
	 * Resolve the --max-stack default: replaying a file uses the compile
	 * time maximum, live tracing honours kernel.perf_event_max_stack.
	 */
	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* min/max-stack alone implies callchains; default to DWARF unwind. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains need bigger mmap buffers; as root (no mlock
		 * limit applies) bump to 4x the mlock-able size unless the
		 * user chose one explicitly.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record' delegates everything to 'perf record'. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Nothing selected at all: no syscalls, no pagefaults, no events. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	/* A syscall qualifier is meaningless if syscall tracing is off. */
	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the 'open' syscall id, used when beautifying filenames. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no pid/tid/cpu/uid target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3182