xref: /linux/tools/perf/builtin-trace.c (revision 33dea5aae0320345af26ae9aba0894a930e0d4ec)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Global state for one 'perf trace' run: tool callbacks, the syscall
 * id<->name table, the tracepoint evsels, filtering options and the
 * statistics accumulated while tracing.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;		/* sys_enter/sys_exit tracepoint evsels */
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* tool self-stats, see thread__fd_path() */
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;
	bool			live;		/* tracing a live workload, /proc is usable */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;	/* TRACE_PFMAJ/TRACE_PFMIN bits */
	int			open_id;
};
122 
/*
 * Accessor for one tracepoint field: the byte offset into the raw sample
 * payload plus a method to fetch it either as an integer or as a pointer.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130 
/*
 * Generate tp_field__u{8,16,32,64}(): fetch an unsigned integer of that
 * width from the raw sample payload.  memcpy is used because the field
 * may not be naturally aligned inside raw_data.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
143 
/*
 * Byte-swapping variants of the above, selected by tp_field__init_uint()
 * when the sample was recorded with the opposite endianness (needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155 
156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
182 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
183 {
184 	return sample->raw_data + field->offset;
185 }
186 
187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
/*
 * Per-evsel state for the sys_enter/sys_exit tracepoints: the 'id' field
 * accessor plus either 'args' (enter) or 'ret' (exit) -- a union since an
 * evsel is one or the other, never both.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Initialize the tp_field member 'name' of the syscall_tp stashed in
 * evsel->priv from the tracepoint format field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216 
217 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
218 					 struct tp_field *field,
219 					 const char *name)
220 {
221 	struct format_field *format_field = perf_evsel__field(evsel, name);
222 
223 	if (format_field == NULL)
224 		return -1;
225 
226 	return tp_field__init_ptr(field, format_field);
227 }
228 
/* As perf_evsel__init_sc_tp_uint_field(), but for pointer-valued fields. */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
/* Free the evsel's ->priv syscall_tp, then the evsel itself (order matters). */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
/* Read the integer tracepoint field 'name' from a sample via evsel->priv. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Get a pointer to the tracepoint field 'name' payload within a sample. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
/* Print arg->val via the strarray in arg->parm, using 'intfmt' out of range. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Common case: decimal fallback for values outside the table. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
/*
 * A set of strarrays to try in order, e.g. fcntl's plain commands plus
 * the F_LINUX_SPECIFIC_BASE-offset ones.
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a strarrays__<array> wrapping an array of strarray pointers. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	-100
343 #endif
344 
345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359 					      struct syscall_arg *arg);
360 
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
/* Default formatters for arguments without a fancier pretty-printer. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377 
/*
 * String tables used to pretty-print integer syscall arguments via
 * SCA_STRARRAY/SCA_STRARRAYS: (value - offset) indexes into the table.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* constants start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl command ranges, tried in order by SCA_FCNTL_CMD. */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
/*
 * Format access(2)'s 'mode' argument: "F" for F_OK alone, otherwise the
 * R/W/X letters for the bits present, with leftover unknown bits in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	/* Bits we did not recognize, if any. */
	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}
483 
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487 					      struct syscall_arg *arg);
488 
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
490 
/* Format pipe2(2)'s flags: known O_* bits by name, the rest in hex. */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Unknown leftover bits, if any. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
511 
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
/* Fallbacks for toolchains whose headers lack the getrandom(2) flags. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/* Format getrandom(2)'s flags: known GRND_* bits by name, the rest in hex. */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	/* Unknown leftover bits, if any. */
	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}
541 
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
/*
 * Initializer for a syscall_arg_fmt entry that prints the argument by
 * looking it up in the given strarray.
 */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/*
 * How to format one syscall argument: the scnprintf method, an optional
 * parameter for it (e.g. a strarray) and the argument name.
 * NOTE(review): show_zero presumably forces printing even when the value
 * is zero -- confirm at the use sites, they are outside this chunk.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569 
/*
 * Per-syscall formatting overrides, keyed by syscall name.
 * NB: entries must be kept sorted by ->name -- syscall_fmt__find() does
 * a bsearch() over this table.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;	/* set on pid-returning syscalls (clone, getpid, wait4, ...) */
	bool	   timeout;	/* set on syscalls taking a timeout (poll, select, ...) */
	bool	   hexret;	/* set on pointer-returning syscalls (brk, mmap, mremap) */
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "arch_prctl", .alias = "prctl", },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
777 
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780 	const struct syscall_fmt *fmt = fmtp;
781 	return strcmp(name, fmt->name);
782 }
783 
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786 	const int nmemb = ARRAY_SIZE(syscall_fmts);
787 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789 
/*
 * Everything known about one syscall: its tracepoint format and argument
 * fields, plus the optional formatting overrides from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* tracepoint argument fields */
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: never returns */
	struct syscall_fmt  *fmt;	/* NULL when no override exists */
	struct syscall_arg_fmt *arg_fmt;
};
799 
800 /*
801  * We need to have this 'calculated' boolean because in some cases we really
802  * don't know what is the duration of a syscall, for instance, when we start
803  * a session and some threads are waiting for a syscall to finish, say 'poll',
804  * in which case all we can do is to print "( ? ) for duration and for the
805  * start timestamp.
806  */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809 	double duration = (double)t / NSEC_PER_MSEC;
810 	size_t printed = fprintf(fp, "(");
811 
812 	if (!calculated)
813 		printed += fprintf(fp, "     ?   ");
814 	else if (duration >= 1.0)
815 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 	else if (duration >= 0.01)
817 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818 	else
819 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 	return printed + fprintf(fp, "): ");
821 }
822 
/**
 * Per-thread tracing state, hung off thread->priv.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counters */
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd cached; -1 while empty */
		char	  **table;	/* fd -> pathname, grown on demand */
	} paths;

	struct intlist *syscall_stats;
};
852 
853 static struct thread_trace *thread_trace__new(void)
854 {
855 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856 
857 	if (ttrace)
858 		ttrace->paths.max = -1;
859 
860 	ttrace->syscall_stats = intlist__new(NULL);
861 
862 	return ttrace;
863 }
864 
865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
866 {
867 	struct thread_trace *ttrace;
868 
869 	if (thread == NULL)
870 		goto fail;
871 
872 	if (thread__priv(thread) == NULL)
873 		thread__set_priv(thread, thread_trace__new());
874 
875 	if (thread__priv(thread) == NULL)
876 		goto fail;
877 
878 	ttrace = thread__priv(thread);
879 	++ttrace->nr_events;
880 
881 	return ttrace;
882 fail:
883 	color_fprintf(fp, PERF_COLOR_RED,
884 		      "WARNING: not enough memory, dropping samples!\n");
885 	return NULL;
886 }
887 
888 
/*
 * Let an argument formatter install a custom return-value formatter for
 * the current syscall, e.g. fcntl(F_DUPFD) returns an fd.
 * NOTE(review): assumes thread__priv(arg->thread) is non-NULL here --
 * callers appear to have gone through thread__trace() first; confirm.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
896 
/* Page fault classes selectable via trace->trace_pgfaults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Buffer size used for thread_trace->entry_str -- NOTE(review): the
 * allocation site is outside this chunk; confirm before changing. */
static const size_t trace__entry_str_size = 2048;
901 
902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
903 {
904 	struct thread_trace *ttrace = thread__priv(thread);
905 
906 	if (fd > ttrace->paths.max) {
907 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
908 
909 		if (npath == NULL)
910 			return -1;
911 
912 		if (ttrace->paths.max != -1) {
913 			memset(npath + ttrace->paths.max + 1, 0,
914 			       (fd - ttrace->paths.max) * sizeof(char *));
915 		} else {
916 			memset(npath, 0, (fd + 1) * sizeof(char *));
917 		}
918 
919 		ttrace->paths.table = npath;
920 		ttrace->paths.max   = fd;
921 	}
922 
923 	ttrace->paths.table[fd] = strdup(pathname);
924 
925 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
926 }
927 
/*
 * Resolve an fd to a pathname by reading its /proc symlink
 * (/proc/<pid>/fd/<fd>, or the task-specific variant for non-leader
 * threads) and cache it via trace__set_fd_pathname().  Returns 0/-1.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/*
	 * For a symlink, st_size is the length of the target, so this also
	 * rejects targets that would not fit in pathname (incl. the NUL).
	 */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* A longer result than lstat() promised means the link changed under us. */
	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate. */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
953 
/*
 * Look up the cached path for an fd, lazily reading it from /proc when
 * missing.  Returns NULL when it cannot be resolved, or when the session
 * is not live (a replayed session must not consult the current /proc).
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
975 
976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
977 {
978 	int fd = arg->val;
979 	size_t printed = scnprintf(bf, size, "%d", fd);
980 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
981 
982 	if (path)
983 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
984 
985 	return printed;
986 }
987 
/*
 * Like the generic fd printer, but additionally drops the cached path for
 * this fd, since close() makes the cached entry stale.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}
1000 
1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002 				     unsigned long ptr)
1003 {
1004 	struct thread_trace *ttrace = thread__priv(thread);
1005 
1006 	ttrace->filename.ptr = ptr;
1007 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009 
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 					      struct syscall_arg *arg)
1012 {
1013 	unsigned long ptr = arg->val;
1014 
1015 	if (!arg->trace->vfs_getname)
1016 		return scnprintf(bf, size, "%#x", ptr);
1017 
1018 	thread__set_filename_pos(arg->thread, bf, ptr);
1019 	return 0;
1020 }
1021 
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026 
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030 
1031 	return fprintf(fp, "%10.3f ", ts);
1032 }
1033 
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042 	if (tstamp > 0)
1043 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1044 
1045 	return fprintf(fp, "         ? ");
1046 }
1047 
/*
 * Written from the signal handler, polled from the main loop: make them
 * volatile so the compiler can't cache the reads across loop iterations.
 */
static volatile bool done = false;
static volatile bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1056 
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 	printed += fprintf_duration(duration, duration_calculated, fp);
1062 
1063 	if (trace->multiple_threads) {
1064 		if (trace->show_comm)
1065 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 		printed += fprintf(fp, "%d ", thread->tid);
1067 	}
1068 
1069 	return printed;
1070 }
1071 
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 				union perf_event *event, struct perf_sample *sample)
1074 {
1075 	int ret = 0;
1076 
1077 	switch (event->header.type) {
1078 	case PERF_RECORD_LOST:
1079 		color_fprintf(trace->output, PERF_COLOR_RED,
1080 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 		ret = machine__process_lost_event(machine, event, sample);
1082 		break;
1083 	default:
1084 		ret = machine__process_event(machine, event, sample);
1085 		break;
1086 	}
1087 
1088 	return ret;
1089 }
1090 
1091 static int trace__tool_process(struct perf_tool *tool,
1092 			       union perf_event *event,
1093 			       struct perf_sample *sample,
1094 			       struct machine *machine)
1095 {
1096 	struct trace *trace = container_of(tool, struct trace, tool);
1097 	return trace__process_event(trace, machine, event, sample);
1098 }
1099 
/*
 * Kernel address resolver wrapper: when kptr_restrict is in effect, warn
 * once per machine and stop trying to resolve kernel samples; otherwise
 * delegate to the regular machine resolver.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1117 
1118 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1119 {
1120 	int err = symbol__init(NULL);
1121 
1122 	if (err)
1123 		return err;
1124 
1125 	trace->host = machine__new_host();
1126 	if (trace->host == NULL)
1127 		return -ENOMEM;
1128 
1129 	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
1130 		return -errno;
1131 
1132 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1133 					    evlist->threads, trace__tool_process, false,
1134 					    trace->opts.proc_map_timeout);
1135 	if (err)
1136 		symbol__exit();
1137 
1138 	return err;
1139 }
1140 
1141 static void trace__symbols__exit(struct trace *trace)
1142 {
1143 	machine__exit(trace->host);
1144 	trace->host = NULL;
1145 
1146 	symbol__exit();
1147 }
1148 
/*
 * Allocate and seed the per-argument formatter table for a syscall.
 * nr_args arrives as 6 when the tracepoint format file could not be
 * read; in that case a non-zero count in the static syscall_fmt table
 * takes precedence.  Returns 0 on success, -1 on allocation failure.
 */
static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		/*
		 * Seed each slot from the static table, when there is one.
		 * NOTE(review): this assumes sc->fmt->arg has at least
		 * nr_args entries — confirm for tracepoints with more
		 * fields than the static table declares.
		 */
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}
1168 
/*
 * Pick default pretty-printers for arguments that didn't get one from the
 * static syscall_fmt table, keying off the tracepoint field's type/name:
 * filename-ish strings, pointers, pid_t, umode_t and "*fd" integers.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from the static table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1207 
/*
 * Fill in trace->syscalls.table[id]: syscall name, static fmt entry,
 * tracepoint format and per-argument formatters.  The table grows on
 * demand, like the per-thread fd -> path table.  Returns 0/-1.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* Zero only the newly added entries: [max + 1, id]. */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: zero the whole table. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the fmt alias, e.g. when the arch renames the entry. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1268 
/*
 * Turn the -e/--expr strlist into an array of syscall ids in
 * trace->ev_qualifier_ids.  Each entry may be an exact syscall name or a
 * glob that expands to several ids (the array is grown in chunks of 8 as
 * glob matches are found).  On any invalid name, all names are still
 * scanned so the error message lists every bad one, then the ids are
 * freed and -EINVAL/-ENOMEM returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* First bad name starts the error line, the rest append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Collect the remaining ids this glob expands to. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1344 
1345 /*
1346  * args is to be interpreted as a series of longs but we need to handle
1347  * 8-byte unaligned accesses. args points to raw_data within the event
1348  * and raw_data is guaranteed to be 8-byte unaligned because it is
1349  * preceded by raw_size which is a u32. So we need to copy args to a temp
1350  * variable to read it. Most notably this avoids extended load instructions
1351  * on unaligned addresses
1352  */
1353 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1354 {
1355 	unsigned long val;
1356 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1357 
1358 	memcpy(&val, p, sizeof(val));
1359 	return val;
1360 }
1361 
1362 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1363 				      struct syscall_arg *arg)
1364 {
1365 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1366 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1367 
1368 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1369 }
1370 
1371 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1372 				     struct syscall_arg *arg, unsigned long val)
1373 {
1374 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1375 		arg->val = val;
1376 		if (sc->arg_fmt[arg->idx].parm)
1377 			arg->parm = sc->arg_fmt[arg->idx].parm;
1378 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1379 	}
1380 	return scnprintf(bf, size, "%ld", val);
1381 }
1382 
/*
 * Format all arguments of a sys_enter into bf.  Arguments already
 * consumed by a multi-slot formatter are skipped via arg.mask/bit.  When
 * the tracepoint format file could not be read (IS_ERR(sc->tp_format))
 * the raw values are printed instead, using the static fmt names.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in a strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1455 
1456 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
1457 				  union perf_event *event,
1458 				  struct perf_sample *sample);
1459 
/*
 * Return the struct syscall for an id, reading its info lazily on first
 * use.  Returns NULL (with a diagnostic at suitable verbosity) for
 * invalid ids or when the syscall info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily populate the table entry on first use of this id. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed part-way. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1502 
1503 static void thread__update_stats(struct thread_trace *ttrace,
1504 				 int id, struct perf_sample *sample)
1505 {
1506 	struct int_node *inode;
1507 	struct stats *stats;
1508 	u64 duration = 0;
1509 
1510 	inode = intlist__findnew(ttrace->syscall_stats, id);
1511 	if (inode == NULL)
1512 		return;
1513 
1514 	stats = inode->priv;
1515 	if (stats == NULL) {
1516 		stats = malloc(sizeof(struct stats));
1517 		if (stats == NULL)
1518 			return;
1519 		init_stats(stats);
1520 		inode->priv = stats;
1521 	}
1522 
1523 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1524 		duration = sample->time - ttrace->entry_time;
1525 
1526 	update_stats(stats, duration);
1527 }
1528 
/*
 * Another event arrived while the current thread's sys_enter line was
 * still pending: flush it, terminated with ") ...", so the interleaved
 * event can be printed on its own line.  Returns chars printed (0 when
 * nothing was pending).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1551 
/*
 * raw_syscalls:sys_enter handler: format the syscall name and arguments
 * into the per-thread entry_str.  For exit-like syscalls (which have no
 * sys_exit) the line is printed immediately; otherwise it is left
 * pending until the matching sys_exit arrives.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer the entry line is staged into. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come for exit/exit_group: print now. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the most recent thread for interrupted-entry handling. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1610 
1611 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1612 				    struct perf_sample *sample,
1613 				    struct callchain_cursor *cursor)
1614 {
1615 	struct addr_location al;
1616 
1617 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1618 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1619 		return -1;
1620 
1621 	return 0;
1622 }
1623 
1624 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1625 {
1626 	/* TODO: user-configurable print_opts */
1627 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1628 				        EVSEL__PRINT_DSO |
1629 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1630 
1631 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1632 }
1633 
/*
 * raw_syscalls:sys_exit handler: completes the pending sys_enter line
 * (or prints a "continued" marker when it was already flushed), applies
 * the --duration filter, and formats the return value per the syscall's
 * fmt: errno name, timeout, hex, child pid, custom ret_scnprintf or
 * plain signed decimal.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open: cache the fd -> pathname mapping from vfs_getname. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry line was flushed by an interleaved event. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed by an argument formatter. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid (fork/clone-like): show its comm. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1747 
/*
 * probe:vfs_getname handler: grab the pathname the kernel resolved and
 * splice it into the pending sys_enter line at the position recorded by
 * thread__set_filename_pos(), truncating from the left if it doesn't fit.
 * The name is also kept in ttrace->filename for the fd cache on sys_exit.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread filename buffer as needed. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No splice position recorded: nothing pending in entry_str. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the name when it doesn't fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the recorded position and copy the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1808 
/*
 * sched:sched_stat_runtime handler: accumulate per-thread and global
 * on-CPU runtime for the summary; when no per-thread state is available
 * the raw sample is dumped instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
        u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1838 
1839 static void bpf_output__printer(enum binary_printer_ops op,
1840 				unsigned int val, void *extra)
1841 {
1842 	FILE *output = extra;
1843 	unsigned char ch = (unsigned char)val;
1844 
1845 	switch (op) {
1846 	case BINARY_PRINT_CHAR_DATA:
1847 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1848 		break;
1849 	case BINARY_PRINT_DATA_BEGIN:
1850 	case BINARY_PRINT_LINE_BEGIN:
1851 	case BINARY_PRINT_ADDR:
1852 	case BINARY_PRINT_NUM_DATA:
1853 	case BINARY_PRINT_NUM_PAD:
1854 	case BINARY_PRINT_SEP:
1855 	case BINARY_PRINT_CHAR_PAD:
1856 	case BINARY_PRINT_LINE_END:
1857 	case BINARY_PRINT_DATA_END:
1858 	default:
1859 		break;
1860 	}
1861 }
1862 
1863 static void bpf_output__fprintf(struct trace *trace,
1864 				struct perf_sample *sample)
1865 {
1866 	print_binary(sample->raw_data, sample->raw_size, 8,
1867 		     bpf_output__printer, trace->output);
1868 }
1869 
/*
 * Handler for non-syscall tracepoints and BPF output events specified
 * with --event: print timestamp, event name and either the BPF payload
 * or the tracepoint's formatted fields, plus an optional callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the syscall lines' duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1910 
1911 static void print_location(FILE *f, struct perf_sample *sample,
1912 			   struct addr_location *al,
1913 			   bool print_dso, bool print_sym)
1914 {
1915 
1916 	if ((verbose > 0 || print_dso) && al->map)
1917 		fprintf(f, "%s@", al->map->dso->long_name);
1918 
1919 	if ((verbose > 0 || print_sym) && al->sym)
1920 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1921 			al->addr - al->sym->start);
1922 	else if (al->map)
1923 		fprintf(f, "0x%" PRIx64, al->addr);
1924 	else
1925 		fprintf(f, "0x%" PRIx64, sample->addr);
1926 }
1927 
/*
 * Page fault (maj/min) software event handler: count the fault on the
 * thread, then print "majfault"/"minfault" with the faulting IP location
 * and the target address's location, annotated with its map type
 * ('d'ata, e'x'ecutable or '?' when unmapped).
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Where the faulting instruction lives. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Where the faulting access went: try data maps first. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* Not a data map: maybe an executable one (e.g. code fault). */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2003 
2004 static void trace__set_base_time(struct trace *trace,
2005 				 struct perf_evsel *evsel,
2006 				 struct perf_sample *sample)
2007 {
2008 	/*
2009 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2010 	 * and don't use sample->time unconditionally, we may end up having
2011 	 * some other event in the future without PERF_SAMPLE_TIME for good
2012 	 * reason, i.e. we may not be interested in its timestamps, just in
2013 	 * it taking place, picking some piece of information when it
2014 	 * appears in our event stream (vfs_getname comes to mind).
2015 	 */
2016 	if (trace->base_time == 0 && !trace->full_time &&
2017 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2018 		trace->base_time = sample->time;
2019 }
2020 
2021 static int trace__process_sample(struct perf_tool *tool,
2022 				 union perf_event *event,
2023 				 struct perf_sample *sample,
2024 				 struct perf_evsel *evsel,
2025 				 struct machine *machine __maybe_unused)
2026 {
2027 	struct trace *trace = container_of(tool, struct trace, tool);
2028 	struct thread *thread;
2029 	int err = 0;
2030 
2031 	tracepoint_handler handler = evsel->handler;
2032 
2033 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2034 	if (thread && thread__is_filtered(thread))
2035 		goto out;
2036 
2037 	trace__set_base_time(trace, evsel, sample);
2038 
2039 	if (handler) {
2040 		++trace->nr_events;
2041 		handler(trace, evsel, event, sample);
2042 	}
2043 out:
2044 	thread__put(thread);
2045 	return err;
2046 }
2047 
2048 static int trace__record(struct trace *trace, int argc, const char **argv)
2049 {
2050 	unsigned int rec_argc, i, j;
2051 	const char **rec_argv;
2052 	const char * const record_args[] = {
2053 		"record",
2054 		"-R",
2055 		"-m", "1024",
2056 		"-c", "1",
2057 	};
2058 
2059 	const char * const sc_args[] = { "-e", };
2060 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2061 	const char * const majpf_args[] = { "-e", "major-faults" };
2062 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2063 	const char * const minpf_args[] = { "-e", "minor-faults" };
2064 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2065 
2066 	/* +1 is for the event string below */
2067 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2068 		majpf_args_nr + minpf_args_nr + argc;
2069 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2070 
2071 	if (rec_argv == NULL)
2072 		return -ENOMEM;
2073 
2074 	j = 0;
2075 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2076 		rec_argv[j++] = record_args[i];
2077 
2078 	if (trace->trace_syscalls) {
2079 		for (i = 0; i < sc_args_nr; i++)
2080 			rec_argv[j++] = sc_args[i];
2081 
2082 		/* event string may be different for older kernels - e.g., RHEL6 */
2083 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2084 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2085 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2086 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2087 		else {
2088 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2089 			return -1;
2090 		}
2091 	}
2092 
2093 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2094 		for (i = 0; i < majpf_args_nr; i++)
2095 			rec_argv[j++] = majpf_args[i];
2096 
2097 	if (trace->trace_pgfaults & TRACE_PFMIN)
2098 		for (i = 0; i < minpf_args_nr; i++)
2099 			rec_argv[j++] = minpf_args[i];
2100 
2101 	for (i = 0; i < (unsigned int)argc; i++)
2102 		rec_argv[j++] = argv[i];
2103 
2104 	return cmd_record(j, rec_argv);
2105 }
2106 
2107 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2108 
2109 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2110 {
2111 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2112 
2113 	if (IS_ERR(evsel))
2114 		return false;
2115 
2116 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2117 		perf_evsel__delete(evsel);
2118 		return false;
2119 	}
2120 
2121 	evsel->handler = trace__vfs_getname;
2122 	perf_evlist__add(evlist, evsel);
2123 	return true;
2124 }
2125 
2126 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2127 {
2128 	struct perf_evsel *evsel;
2129 	struct perf_event_attr attr = {
2130 		.type = PERF_TYPE_SOFTWARE,
2131 		.mmap_data = 1,
2132 	};
2133 
2134 	attr.config = config;
2135 	attr.sample_period = 1;
2136 
2137 	event_attr_init(&attr);
2138 
2139 	evsel = perf_evsel__new(&attr);
2140 	if (evsel)
2141 		evsel->handler = trace__pgfault;
2142 
2143 	return evsel;
2144 }
2145 
2146 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2147 {
2148 	const u32 type = event->header.type;
2149 	struct perf_evsel *evsel;
2150 
2151 	if (type != PERF_RECORD_SAMPLE) {
2152 		trace__process_event(trace, trace->host, event, sample);
2153 		return;
2154 	}
2155 
2156 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2157 	if (evsel == NULL) {
2158 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2159 		return;
2160 	}
2161 
2162 	trace__set_base_time(trace, evsel, sample);
2163 
2164 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2165 	    sample->raw_data == NULL) {
2166 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2167 		       perf_evsel__name(evsel), sample->tid,
2168 		       sample->cpu, sample->raw_size);
2169 	} else {
2170 		tracepoint_handler handler = evsel->handler;
2171 		handler(trace, evsel, event, sample);
2172 	}
2173 }
2174 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, cache the
 * offsets of the fields we use ('args' on enter, 'ret' on exit), and add
 * both to the evlist.  On success they are also stashed in
 * trace->syscalls.events for direct access later.
 *
 * Returns 0 on success, -1 on failure, deleting any partially set up
 * evsel (labels unwind in reverse order of creation).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwinding only below this point. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2220 
/*
 * Turn the syscall ids collected in trace->ev_qualifier_ids into a
 * tracepoint filter expression on the 'id' field (inverted when the
 * qualifier started with '!') and append it to both the sys_enter and
 * sys_exit events.
 *
 * Returns 0 on success, -1 on failure; errno is set to ENOMEM when
 * building the filter expression itself fails.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* perf_evsel__append_tp_filter() returns 0 on success. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2245 
/*
 * Build the default kernel-side pid filter: always our own pid, plus the
 * pid of an "sshd" ancestor if one is found walking up the parent chain —
 * presumably to avoid the feedback loop of tracing the daemon that is
 * carrying our own output (NOTE(review): confirm intent against the
 * commit that introduced this).
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain, bounded by the size of pids[]. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2269 
/*
 * The live-tracing main loop: set up the requested evsels (raw_syscalls,
 * page faults, vfs_getname, sched_stat_runtime), configure callchains and
 * filters, mmap the ring buffers and consume events until the workload
 * ends or the user interrupts us.
 *
 * Returns 0 on success, negative on error.  Note the unusual layout at
 * the end: the error labels that need an errbuf live inside a block that
 * is only ever entered via goto (the '{' right after 'return err;').
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	/* --- event creation --- */
	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	/* --- callchain configuration --- */
	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/* --- event consumption loop --- */
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload done: stop producing, keep draining buffers. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/* Reached only via goto: gives the labels below a shared errbuf. */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2535 
/*
 * 'perf trace -i perf.data': replay a previously recorded session,
 * hooking up the same sys_enter/sys_exit/pgfault handlers used for live
 * tracing to the events found in the file.
 *
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	/* Wire up the perf_tool callbacks used while processing the file. */
	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to trace__pgfault(). */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2634 
/* Emit the banner line that precedes the per-thread summaries. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2643 
/*
 * Resorted view of a thread's per-syscall stats intlist, ordered by total
 * time (msecs).  The body below fills one resorted entry from an intlist
 * node: 'source->i' is the syscall id, its priv pointer holds the
 * accumulated struct stats (may be NULL).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = nr_calls * mean, converted from nsecs to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2657 
/*
 * Print one thread's syscall statistics table (calls, total/min/avg/max
 * time and relative stddev) using the syscall_stats resort machinery
 * defined above.  Returns the number of characters printed; 0 when the
 * thread has no stats.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Times accumulate in nsecs; the table shows msecs. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative standard deviation, as a percentage of the mean. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2700 
2701 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2702 {
2703 	size_t printed = 0;
2704 	struct thread_trace *ttrace = thread__priv(thread);
2705 	double ratio;
2706 
2707 	if (ttrace == NULL)
2708 		return 0;
2709 
2710 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2711 
2712 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2713 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2714 	printed += fprintf(fp, "%.1f%%", ratio);
2715 	if (ttrace->pfmaj)
2716 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2717 	if (ttrace->pfmin)
2718 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2719 	if (trace->sched)
2720 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2721 	else if (fputc('\n', fp) != EOF)
2722 		++printed;
2723 
2724 	printed += thread__dump_stats(ttrace, trace, fp);
2725 
2726 	return printed;
2727 }
2728 
2729 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2730 {
2731 	return ttrace ? ttrace->nr_events : 0;
2732 }
2733 
/*
 * Resorted view of the machine's thread rbtree, ordered by per-thread
 * event count (thread_trace is stashed in thread->priv); used for the
 * summary output.  The body fills one resorted entry from a thread node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2740 
/*
 * Print the "Summary of events" section: one block per thread, ordered by
 * per-thread event count via the threads resort machinery above.
 * Returns the number of characters printed; 0 if sorting failed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2759 
2760 static int trace__set_duration(const struct option *opt, const char *str,
2761 			       int unset __maybe_unused)
2762 {
2763 	struct trace *trace = opt->value;
2764 
2765 	trace->duration_filter = atof(str);
2766 	return 0;
2767 }
2768 
2769 static int trace__set_filter_pids(const struct option *opt, const char *str,
2770 				  int unset __maybe_unused)
2771 {
2772 	int ret = -1;
2773 	size_t i;
2774 	struct trace *trace = opt->value;
2775 	/*
2776 	 * FIXME: introduce a intarray class, plain parse csv and create a
2777 	 * { int nr, int entries[] } struct...
2778 	 */
2779 	struct intlist *list = intlist__new(str);
2780 
2781 	if (list == NULL)
2782 		return -1;
2783 
2784 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2785 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2786 
2787 	if (trace->filter_pids.entries == NULL)
2788 		goto out;
2789 
2790 	trace->filter_pids.entries[0] = getpid();
2791 
2792 	for (i = 1; i < trace->filter_pids.nr; ++i)
2793 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2794 
2795 	intlist__delete(list);
2796 	ret = 0;
2797 out:
2798 	return ret;
2799 }
2800 
2801 static int trace__open_output(struct trace *trace, const char *filename)
2802 {
2803 	struct stat st;
2804 
2805 	if (!stat(filename, &st) && st.st_size) {
2806 		char oldname[PATH_MAX];
2807 
2808 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2809 		unlink(oldname);
2810 		rename(filename, oldname);
2811 	}
2812 
2813 	trace->output = fopen(filename, "w");
2814 
2815 	return trace->output == NULL ? -errno : 0;
2816 }
2817 
2818 static int parse_pagefaults(const struct option *opt, const char *str,
2819 			    int unset __maybe_unused)
2820 {
2821 	int *trace_pgfaults = opt->value;
2822 
2823 	if (strcmp(str, "all") == 0)
2824 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2825 	else if (strcmp(str, "maj") == 0)
2826 		*trace_pgfaults |= TRACE_PFMAJ;
2827 	else if (strcmp(str, "min") == 0)
2828 		*trace_pgfaults |= TRACE_PFMIN;
2829 	else
2830 		return -1;
2831 
2832 	return 0;
2833 }
2834 
2835 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2836 {
2837 	struct perf_evsel *evsel;
2838 
2839 	evlist__for_each_entry(evlist, evsel)
2840 		evsel->handler = handler;
2841 }
2842 
2843 /*
2844  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2845  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2846  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2847  *
2848  * It'd be better to introduce a parse_options() variant that would return a
2849  * list with the terms it didn't match to an event...
2850  */
2851 static int trace__parse_events_option(const struct option *opt, const char *str,
2852 				      int unset __maybe_unused)
2853 {
2854 	struct trace *trace = (struct trace *)opt->value;
2855 	const char *s = str;
2856 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2857 	int len = strlen(str) + 1, err = -1, list, idx;
2858 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2859 	char group_name[PATH_MAX];
2860 
2861 	if (strace_groups_dir == NULL)
2862 		return -1;
2863 
2864 	if (*s == '!') {
2865 		++s;
2866 		trace->not_ev_qualifier = true;
2867 	}
2868 
2869 	while (1) {
2870 		if ((sep = strchr(s, ',')) != NULL)
2871 			*sep = '\0';
2872 
2873 		list = 0;
2874 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2875 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2876 			list = 1;
2877 		} else {
2878 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2879 			if (access(group_name, R_OK) == 0)
2880 				list = 1;
2881 		}
2882 
2883 		if (lists[list]) {
2884 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2885 		} else {
2886 			lists[list] = malloc(len);
2887 			if (lists[list] == NULL)
2888 				goto out;
2889 			strcpy(lists[list], s);
2890 		}
2891 
2892 		if (!sep)
2893 			break;
2894 
2895 		*sep = ',';
2896 		s = sep + 1;
2897 	}
2898 
2899 	if (lists[1] != NULL) {
2900 		struct strlist_config slist_config = {
2901 			.dirname = strace_groups_dir,
2902 		};
2903 
2904 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2905 		if (trace->ev_qualifier == NULL) {
2906 			fputs("Not enough memory to parse event qualifier", trace->output);
2907 			goto out;
2908 		}
2909 
2910 		if (trace__validate_ev_qualifier(trace))
2911 			goto out;
2912 	}
2913 
2914 	err = 0;
2915 
2916 	if (lists[0]) {
2917 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2918 					       "event selector. use 'perf list' to list available events",
2919 					       parse_events_option);
2920 		err = parse_events_option(&o, lists[0], 0);
2921 	}
2922 out:
2923 	if (sep)
2924 		*sep = ',';
2925 
2926 	return err;
2927 }
2928 
/*
 * Entry point for 'perf trace': parses options, resolves defaults that
 * depend on what the user did (or did not) set, then either delegates to
 * 'perf trace record', replays a perf.data file (-i), or runs a live
 * trace of the workload/PIDs/CPUs selected.
 *
 * Returns 0 on success, a negative value on error; the value becomes the
 * command's exit status source.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/*
	 * UINT_MAX/ULLONG_MAX initializers below act as "not set by the
	 * user" sentinels; they are tested after option parsing to decide
	 * whether to apply computed defaults (see mmap_pages_user_set and
	 * max_stack handling further down).
	 */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,	/* replaced if -o/--output is given */
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace instead of dying silently on crashes. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	/* Default error code for the option-validation bail-outs below. */
	err = -1;

	if (trace.trace_pgfaults) {
		/* Page fault samples need address + timestamp to be useful. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinel still present => -m/--mmap-pages was not given. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		/*
		 * No --max-stack given: when replaying a file use the compiled-in
		 * maximum, for live tracing honor the current sysctl limit.
		 */
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/*
	 * The user asked for stack depth limits but not for callchains
	 * explicitly (--call-graph): default to dwarf unwinding so those
	 * limits have something to act on.
	 */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchain samples are large; when root and the user didn't
		 * pick a ring buffer size, grow it beyond the mlock limit.
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record ...' delegates entirely to perf record. */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* Nothing selected at all: no syscalls, no pagefaults, no -e events. */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	/* ev_qualifier holds syscall names; meaningless without syscall tracing. */
	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the 'open' syscall id, used when beautifying its args. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no -p/-t/-C/-u target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	/* Only close what we opened; the default trace.output is stderr. */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3136