xref: /linux/tools/perf/builtin-trace.c (revision 403f833d15a33bfd8e50dd79fa8e25fb4aa132f6)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/cgroup.h"
23 #include "util/color.h"
24 #include "util/debug.h"
25 #include "util/env.h"
26 #include "util/event.h"
27 #include "util/evlist.h"
28 #include <subcmd/exec-cmd.h>
29 #include "util/machine.h"
30 #include "util/path.h"
31 #include "util/session.h"
32 #include "util/thread.h"
33 #include <subcmd/parse-options.h>
34 #include "util/strlist.h"
35 #include "util/intlist.h"
36 #include "util/thread_map.h"
37 #include "util/stat.h"
38 #include "trace/beauty/beauty.h"
39 #include "trace-event.h"
40 #include "util/parse-events.h"
41 #include "util/bpf-loader.h"
42 #include "callchain.h"
43 #include "print_binary.h"
44 #include "string2.h"
45 #include "syscalltbl.h"
46 #include "rb_resort.h"
47 
48 #include <errno.h>
49 #include <inttypes.h>
50 #include <poll.h>
51 #include <signal.h>
52 #include <stdlib.h>
53 #include <string.h>
54 #include <linux/err.h>
55 #include <linux/filter.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 #include <fcntl.h>
61 
62 #include "sane_ctype.h"
63 
/* Fallback definitions for systems whose headers predate these constants. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif
71 
/*
 * Global state for one 'perf trace' session: the events being recorded,
 * the syscall table, per-run options and the various output/filter knobs.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id seen */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			/* the raw_syscalls (or syscalls) tracepoint evsels */
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the last event shown */
	struct cgroup		*cgroup;
	u64			base_time;	/* first sample time, for relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to exclude from tracing */
	}			filter_pids;
	double			duration_filter; /* only show syscalls longer than this (ms) */
	double			runtime_ms;
	struct {
		/* tool self-stats: getname resolutions done via each source */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier; /* ev_qualifier is an exclude list */
	bool			live;		  /* tracing live, not from a perf.data file */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;	  /* TRACE_PFMAJ/TRACE_PFMIN bits */
};
126 
/*
 * Accessor for one field in a tracepoint's raw payload: the offset into
 * sample->raw_data plus a reader that is either an integer extractor
 * (possibly byte-swapping) or a plain pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
134 
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer
 * of the given width from the sample payload at the field's offset.
 * memcpy() is used because the payload offset may be unaligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
147 
/*
 * Same as TP_UINT_FIELD() but byte-swapping, for samples recorded on a
 * machine of the opposite endianness. No 8-bit variant: a single byte
 * needs no swap.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
159 
160 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
161 {
162 	field->offset = offset;
163 
164 	switch (size) {
165 	case 1:
166 		field->integer = tp_field__u8;
167 		break;
168 	case 2:
169 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
170 		break;
171 	case 4:
172 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
173 		break;
174 	case 8:
175 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
176 		break;
177 	default:
178 		return -1;
179 	}
180 
181 	return 0;
182 }
183 
184 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap)
185 {
186 	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
187 }
188 
189 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
190 {
191 	return sample->raw_data + field->offset;
192 }
193 
194 static int __tp_field__init_ptr(struct tp_field *field, int offset)
195 {
196 	field->offset = offset;
197 	field->pointer = tp_field__ptr;
198 	return 0;
199 }
200 
201 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
202 {
203 	return __tp_field__init_ptr(field, format_field->offset);
204 }
205 
/*
 * Per-evsel accessors for the syscall tracepoints: the syscall id plus,
 * depending on direction, the entry arguments or the exit return value
 * (they share storage, an evsel is one direction only).
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
212 
213 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
214 					  struct tp_field *field,
215 					  const char *name)
216 {
217 	struct format_field *format_field = perf_evsel__field(evsel, name);
218 
219 	if (format_field == NULL)
220 		return -1;
221 
222 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
223 }
224 
/*
 * Bind the syscall_tp member named 'name' in evsel->priv to the
 * tracepoint field of the same name (the stringified token).
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
228 
/*
 * Look up @name in the evsel's tracepoint format and bind a pointer
 * accessor to it.  Returns 0 on success, -1 if the field doesn't exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
240 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
244 
/* Free the evsel's private syscall_tp state, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
250 
251 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
252 {
253 	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
254 
255 	if (evsel->priv != NULL) {
256 		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr"))
257 			goto out_delete;
258 		return 0;
259 	}
260 
261 	return -ENOMEM;
262 out_delete:
263 	zfree(&evsel->priv);
264 	return -ENOENT;
265 }
266 
267 static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel)
268 {
269 	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));
270 
271 	if (evsel->priv != NULL) {       /* field, sizeof_field, offsetof_field */
272 		if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap))
273 			goto out_delete;
274 
275 		return 0;
276 	}
277 
278 	return -ENOMEM;
279 out_delete:
280 	zfree(&evsel->priv);
281 	return -EINVAL;
282 }
283 
/*
 * Bind the args pointer accessor right after the id field; assumes the
 * id occupies a u64 slot in the augmented payload — see the sizes used
 * in perf_evsel__init_augmented_syscall_tp().
 */
static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}
290 
291 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
292 {
293 	evsel->priv = malloc(sizeof(struct syscall_tp));
294 	if (evsel->priv != NULL) {
295 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
296 			goto out_delete;
297 
298 		evsel->handler = handler;
299 		return 0;
300 	}
301 
302 	return -ENOMEM;
303 
304 out_delete:
305 	zfree(&evsel->priv);
306 	return -ENOENT;
307 }
308 
309 static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
310 {
311 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
312 
313 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
314 	if (IS_ERR(evsel))
315 		evsel = perf_evsel__newtp("syscalls", direction);
316 
317 	if (IS_ERR(evsel))
318 		return NULL;
319 
320 	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
321 		goto out_delete;
322 
323 	return evsel;
324 
325 out_delete:
326 	perf_evsel__delete_priv(evsel);
327 	return NULL;
328 }
329 
/* Read the named syscall_tp member out of a sample, as integer/pointer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
337 
338 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
339 {
340 	int idx = val - sa->offset;
341 
342 	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL)
343 		return scnprintf(bf, size, intfmt, val);
344 
345 	return scnprintf(bf, size, "%s", sa->entries[idx]);
346 }
347 
/* Format a syscall arg via the strarray stashed in arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
354 
/* Strarray formatter with plain decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
362 
/*
 * A set of strarrays tried in order, for values split across ranges
 * (e.g. standard + Linux-specific fcntl commands).
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
372 
373 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
374 					struct syscall_arg *arg)
375 {
376 	struct strarrays *sas = arg->parm;
377 	int i;
378 
379 	for (i = 0; i < sas->nr_entries; ++i) {
380 		struct strarray *sa = sas->entries[i];
381 		int idx = arg->val - sa->offset;
382 
383 		if (idx >= 0 && idx < sa->nr_entries) {
384 			if (sa->entries[idx] == NULL)
385 				break;
386 			return scnprintf(bf, size, "%s", sa->entries[idx]);
387 		}
388 	}
389 
390 	return scnprintf(bf, size, "%d", arg->val);
391 }
392 
393 #ifndef AT_FDCWD
394 #define AT_FDCWD	-100
395 #endif
396 
397 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
398 					   struct syscall_arg *arg)
399 {
400 	int fd = arg->val;
401 
402 	if (fd == AT_FDCWD)
403 		return scnprintf(bf, size, "CWD");
404 
405 	return syscall_arg__scnprintf_fd(bf, size, arg);
406 }
407 
408 #define SCA_FDAT syscall_arg__scnprintf_fd_at
409 
/* Defined later: close() also invalidates the cached fd -> path entry. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
414 
/* Print the argument as a 0x-prefixed hex value (addresses, raw args). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
419 
/* Print the argument as a signed decimal int. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
424 
/* Print the argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
429 
/* bpf(2) command names, indexed by the BPF_* cmd value. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);
435 
/* epoll_ctl(2) op names; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' values (ITIMER_*). */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) operation names (KEYCTL_*). */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) whence values; DATA/HOLE only where the libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
460 
/* Standard fcntl(2) command names, indexed by F_* value. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Linux-only fcntl commands, starting at F_LINUX_SPECIFIC_BASE (1024).
 * The [5] skips a hole in the numbering before F_CANCELLK. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both ranges, tried in order by syscall_arg__scnprintf_strarrays(). */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);
483 
/* {get,set}rlimit(2)/prlimit64(2) resource names (RLIMIT_*). */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' values (SIG_*). */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) et al clock ids (CLOCK_*). */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2) address family names (AF_*). */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
510 
/*
 * Format an access(2) mode bitmask as "R|W|X" (or "F" for F_OK == 0);
 * any bits not covered by R_OK/W_OK/X_OK are appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
537 
/* Defined later: prints the pathname collected via vfs_getname. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
542 
/*
 * Format pipe2(2) flags as "CLOEXEC|NONBLOCK"; leftover unknown bits
 * are appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
565 
/* Fallbacks for headers lacking the getrandom(2) flag definitions. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Format getrandom(2) flags as "RANDOM|NONBLOCK"; unknown leftover bits
 * are appended in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
595 
/* Initializer shorthand for a syscall_arg_fmt backed by a strarray. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
599 
600 #include "trace/beauty/arch_errno_names.c"
601 #include "trace/beauty/eventfd.c"
602 #include "trace/beauty/futex_op.c"
603 #include "trace/beauty/futex_val3.c"
604 #include "trace/beauty/mmap.c"
605 #include "trace/beauty/mode_t.c"
606 #include "trace/beauty/msg_flags.c"
607 #include "trace/beauty/open_flags.c"
608 #include "trace/beauty/perf_event_open.c"
609 #include "trace/beauty/pid.c"
610 #include "trace/beauty/sched_policy.c"
611 #include "trace/beauty/seccomp.c"
612 #include "trace/beauty/signum.c"
613 #include "trace/beauty/socket_type.c"
614 #include "trace/beauty/waitid_options.c"
615 
/*
 * How to pretty-print one syscall argument: the formatter, its optional
 * parameter (e.g. a strarray), an optional name override and whether a
 * zero value should still be printed.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
622 
/*
 * Per-syscall formatting overrides.  NOTE: the table is searched with
 * bsearch() (see syscall_fmt__find()), so entries MUST be kept sorted
 * by .name.
 */
static struct syscall_fmt {
	const char *name;	/* canonical name, the bsearch sort key */
	const char *alias;	/* alternate name used by some arches/kernels */
	struct syscall_arg_fmt arg[6];	/* per-argument formatters */
	u8	   nr_args;	/* explicit arg count — NOTE(review): presumably for when tracefs lacks the info; confirm at use site */
	bool	   errpid;	/* NOTE(review): presumably return value is a pid, not an errno — confirm at use site */
	bool	   timeout;	/* syscall takes a timeout argument */
	bool	   hexret;	/* print return value in hex (addresses) */
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ },
		   [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "gettid",	    .errpid = true, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "kcmp",	    .nr_args = 5,
	  .arg = { [0] = { .name = "pid1",	.scnprintf = SCA_PID, },
		   [1] = { .name = "pid2",	.scnprintf = SCA_PID, },
		   [2] = { .name = "type",	.scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1",	.scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2",	.scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "prctl", .alias = "arch_prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ },
		   [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
843 
/* bsearch() comparator: key is a syscall name, element a syscall_fmt. */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
849 
850 static struct syscall_fmt *syscall_fmt__find(const char *name)
851 {
852 	const int nmemb = ARRAY_SIZE(syscall_fmts);
853 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
854 }
855 
/*
 * Everything known about one syscall id, resolved lazily from tracefs.
 *
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 */
struct syscall {
	struct event_format *tp_format;	/* sys_enter tracepoint format */
	int		    nr_args;
	int		    args_size;
	bool		    is_exit;
	bool		    is_open;
	struct format_field *args;	/* first argument field in tp_format */
	const char	    *name;
	struct syscall_fmt  *fmt;	/* override from syscall_fmts[], may be NULL */
	struct syscall_arg_fmt *arg_fmt;
};
872 
873 /*
874  * We need to have this 'calculated' boolean because in some cases we really
875  * don't know what is the duration of a syscall, for instance, when we start
876  * a session and some threads are waiting for a syscall to finish, say 'poll',
877  * in which case all we can do is to print "( ? ) for duration and for the
878  * start timestamp.
879  */
880 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
881 {
882 	double duration = (double)t / NSEC_PER_MSEC;
883 	size_t printed = fprintf(fp, "(");
884 
885 	if (!calculated)
886 		printed += fprintf(fp, "         ");
887 	else if (duration >= 1.0)
888 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
889 	else if (duration >= 0.01)
890 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
891 	else
892 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
893 	return printed + fprintf(fp, "): ");
894 }
895 
/**
 * Per-thread state accumulated while tracing.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending; /* sys_enter seen, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, printed at exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd cached, -1 when empty */
		char	  **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;	/* per-syscall-id stats for --summary */
};
925 
926 static struct thread_trace *thread_trace__new(void)
927 {
928 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
929 
930 	if (ttrace)
931 		ttrace->paths.max = -1;
932 
933 	ttrace->syscall_stats = intlist__new(NULL);
934 
935 	return ttrace;
936 }
937 
/*
 * Get the thread_trace area for 'thread', lazily allocating it on first
 * use and bumping its event counter.  Returns NULL (after warning on
 * 'fp') when 'thread' is NULL or the allocation failed.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* First event seen for this thread: allocate its priv area. */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
960 
961 
962 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
963 				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
964 {
965 	struct thread_trace *ttrace = thread__priv(arg->thread);
966 
967 	ttrace->ret_scnprintf = ret_scnprintf;
968 }
969 
/* NOTE(review): presumably bits selecting major/minor page fault tracing — confirm against the option handling outside this chunk. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer (thread_trace::entry_str) where the enter line is built. */
static const size_t trace__entry_str_size = 2048;
974 
975 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
976 {
977 	struct thread_trace *ttrace = thread__priv(thread);
978 
979 	if (fd > ttrace->paths.max) {
980 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
981 
982 		if (npath == NULL)
983 			return -1;
984 
985 		if (ttrace->paths.max != -1) {
986 			memset(npath + ttrace->paths.max + 1, 0,
987 			       (fd - ttrace->paths.max) * sizeof(char *));
988 		} else {
989 			memset(npath, 0, (fd + 1) * sizeof(char *));
990 		}
991 
992 		ttrace->paths.table = npath;
993 		ttrace->paths.max   = fd;
994 	}
995 
996 	ttrace->paths.table[fd] = strdup(pathname);
997 
998 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
999 }
1000 
1001 static int thread__read_fd_path(struct thread *thread, int fd)
1002 {
1003 	char linkname[PATH_MAX], pathname[PATH_MAX];
1004 	struct stat st;
1005 	int ret;
1006 
1007 	if (thread->pid_ == thread->tid) {
1008 		scnprintf(linkname, sizeof(linkname),
1009 			  "/proc/%d/fd/%d", thread->pid_, fd);
1010 	} else {
1011 		scnprintf(linkname, sizeof(linkname),
1012 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1013 	}
1014 
1015 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1016 		return -1;
1017 
1018 	ret = readlink(linkname, pathname, sizeof(pathname));
1019 
1020 	if (ret < 0 || ret > st.st_size)
1021 		return -1;
1022 
1023 	pathname[ret] = '\0';
1024 	return trace__set_fd_pathname(thread, fd, pathname);
1025 }
1026 
1027 static const char *thread__fd_path(struct thread *thread, int fd,
1028 				   struct trace *trace)
1029 {
1030 	struct thread_trace *ttrace = thread__priv(thread);
1031 
1032 	if (ttrace == NULL)
1033 		return NULL;
1034 
1035 	if (fd < 0)
1036 		return NULL;
1037 
1038 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1039 		if (!trace->live)
1040 			return NULL;
1041 		++trace->stats.proc_getname;
1042 		if (thread__read_fd_path(thread, fd))
1043 			return NULL;
1044 	}
1045 
1046 	return ttrace->paths.table[fd];
1047 }
1048 
1049 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
1050 {
1051 	int fd = arg->val;
1052 	size_t printed = scnprintf(bf, size, "%d", fd);
1053 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1054 
1055 	if (path)
1056 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1057 
1058 	return printed;
1059 }
1060 
1061 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
1062 {
1063         size_t printed = scnprintf(bf, size, "%d", fd);
1064 	struct thread *thread = machine__find_thread(trace->host, pid, pid);
1065 
1066 	if (thread) {
1067 		const char *path = thread__fd_path(thread, fd, trace);
1068 
1069 		if (path)
1070 			printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1071 
1072 		thread__put(thread);
1073 	}
1074 
1075         return printed;
1076 }
1077 
1078 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1079 					      struct syscall_arg *arg)
1080 {
1081 	int fd = arg->val;
1082 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1083 	struct thread_trace *ttrace = thread__priv(arg->thread);
1084 
1085 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1086 		zfree(&ttrace->paths.table[fd]);
1087 
1088 	return printed;
1089 }
1090 
1091 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1092 				     unsigned long ptr)
1093 {
1094 	struct thread_trace *ttrace = thread__priv(thread);
1095 
1096 	ttrace->filename.ptr = ptr;
1097 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1098 }
1099 
1100 static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
1101 {
1102 	struct augmented_arg *augmented_arg = arg->augmented.args;
1103 
1104 	return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value);
1105 }
1106 
1107 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1108 					      struct syscall_arg *arg)
1109 {
1110 	unsigned long ptr = arg->val;
1111 
1112 	if (arg->augmented.args)
1113 		return syscall_arg__scnprintf_augmented_string(arg, bf, size);
1114 
1115 	if (!arg->trace->vfs_getname)
1116 		return scnprintf(bf, size, "%#x", ptr);
1117 
1118 	thread__set_filename_pos(arg->thread, bf, ptr);
1119 	return 0;
1120 }
1121 
1122 static bool trace__filter_duration(struct trace *trace, double t)
1123 {
1124 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1125 }
1126 
1127 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1128 {
1129 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1130 
1131 	return fprintf(fp, "%10.3f ", ts);
1132 }
1133 
1134 /*
1135  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1136  * using ttrace->entry_time for a thread that receives a sys_exit without
1137  * first having received a sys_enter ("poll" issued before tracing session
1138  * starts, lost sys_enter exit due to ring buffer overflow).
1139  */
1140 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1141 {
1142 	if (tstamp > 0)
1143 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1144 
1145 	return fprintf(fp, "         ? ");
1146 }
1147 
/*
 * Written from the signal handler, read by the main loop: use
 * volatile sig_atomic_t instead of plain bool for async-signal safety.
 */
static volatile sig_atomic_t done;
static volatile sig_atomic_t interrupted;

static void sig_handler(int sig)
{
	done = 1;
	/* Only a ^C (SIGINT) marks the session as user-interrupted. */
	interrupted = sig == SIGINT;
}
1156 
1157 static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
1158 {
1159 	size_t printed = 0;
1160 
1161 	if (trace->multiple_threads) {
1162 		if (trace->show_comm)
1163 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1164 		printed += fprintf(fp, "%d ", thread->tid);
1165 	}
1166 
1167 	return printed;
1168 }
1169 
1170 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1171 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1172 {
1173 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1174 	printed += fprintf_duration(duration, duration_calculated, fp);
1175 	return printed + trace__fprintf_comm_tid(trace, thread, fp);
1176 }
1177 
1178 static int trace__process_event(struct trace *trace, struct machine *machine,
1179 				union perf_event *event, struct perf_sample *sample)
1180 {
1181 	int ret = 0;
1182 
1183 	switch (event->header.type) {
1184 	case PERF_RECORD_LOST:
1185 		color_fprintf(trace->output, PERF_COLOR_RED,
1186 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1187 		ret = machine__process_lost_event(machine, event, sample);
1188 		break;
1189 	default:
1190 		ret = machine__process_event(machine, event, sample);
1191 		break;
1192 	}
1193 
1194 	return ret;
1195 }
1196 
1197 static int trace__tool_process(struct perf_tool *tool,
1198 			       union perf_event *event,
1199 			       struct perf_sample *sample,
1200 			       struct machine *machine)
1201 {
1202 	struct trace *trace = container_of(tool, struct trace, tool);
1203 	return trace__process_event(trace, machine, event, sample);
1204 }
1205 
1206 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1207 {
1208 	struct machine *machine = vmachine;
1209 
1210 	if (machine->kptr_restrict_warned)
1211 		return NULL;
1212 
1213 	if (symbol_conf.kptr_restrict) {
1214 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1215 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1216 			   "Kernel samples will not be resolved.\n");
1217 		machine->kptr_restrict_warned = true;
1218 		return NULL;
1219 	}
1220 
1221 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1222 }
1223 
1224 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1225 {
1226 	int err = symbol__init(NULL);
1227 
1228 	if (err)
1229 		return err;
1230 
1231 	trace->host = machine__new_host();
1232 	if (trace->host == NULL)
1233 		return -ENOMEM;
1234 
1235 	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
1236 	if (err < 0)
1237 		goto out;
1238 
1239 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1240 					    evlist->threads, trace__tool_process, false,
1241 					    trace->opts.proc_map_timeout, 1);
1242 out:
1243 	if (err)
1244 		symbol__exit();
1245 
1246 	return err;
1247 }
1248 
1249 static void trace__symbols__exit(struct trace *trace)
1250 {
1251 	machine__exit(trace->host);
1252 	trace->host = NULL;
1253 
1254 	symbol__exit();
1255 }
1256 
1257 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1258 {
1259 	int idx;
1260 
1261 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1262 		nr_args = sc->fmt->nr_args;
1263 
1264 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1265 	if (sc->arg_fmt == NULL)
1266 		return -1;
1267 
1268 	for (idx = 0; idx < nr_args; ++idx) {
1269 		if (sc->fmt)
1270 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1271 	}
1272 
1273 	sc->nr_args = nr_args;
1274 	return 0;
1275 }
1276 
/*
 * Pick default beautifiers for each syscall arg from the tracepoint field
 * type/name heuristics below, for args without one set in the static
 * syscall_fmt table.  Also computes sc->args_size (offset + size of the
 * last field), i.e. the size of the raw args payload in a sample.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field, *last_field = NULL;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		last_field = field;

		/* Formatter explicitly set in the syscall_fmt table? Keep it. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Integer fields whose name ends in "fd" are fds:
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	if (last_field)
		sc->args_size = last_field->offset + last_field->size;

	return 0;
}
1320 
/*
 * Lazily fill in trace->syscalls.table[id]: grow the table if needed,
 * look up the syscall name and its static format entry, read the
 * sys_enter_* tracepoint format and set up the per-arg beautifiers.
 * Returns 0 on success, -1 when the id is unknown or memory/format
 * reading fails.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table up to 'id', zeroing only the new entries. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Retry under the alias from the static table, if there is one. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Without a tp_format assume the generic maximum of 6 arguments. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * The first field, '__syscall_nr' ('nr' on older kernels), is the
	 * syscall number, which we already have: skip it.  It may also be
	 * absent altogether on even older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
1382 
/*
 * Translate the syscall name list (trace->ev_qualifier) into an array of
 * syscall ids (trace->ev_qualifier_ids), supporting glob patterns via
 * syscalltbl__strglobmatch_{first,next}().  On invalid names all the
 * offenders are listed on trace->output and the ids array is freed.
 * Returns 0 or a negative errno-style value.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name, maybe it is a glob, e.g. "epoll*". */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Invalid: report it, but keep going to list all offenders. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* A glob may match more syscalls; grow the array as needed. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1458 
1459 /*
1460  * args is to be interpreted as a series of longs but we need to handle
1461  * 8-byte unaligned accesses. args points to raw_data within the event
1462  * and raw_data is guaranteed to be 8-byte unaligned because it is
1463  * preceded by raw_size which is a u32. So we need to copy args to a temp
1464  * variable to read it. Most notably this avoids extended load instructions
1465  * on unaligned addresses
1466  */
1467 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1468 {
1469 	unsigned long val;
1470 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1471 
1472 	memcpy(&val, p, sizeof(val));
1473 	return val;
1474 }
1475 
1476 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1477 				      struct syscall_arg *arg)
1478 {
1479 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1480 		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1481 
1482 	return scnprintf(bf, size, "arg%d: ", arg->idx);
1483 }
1484 
1485 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1486 				     struct syscall_arg *arg, unsigned long val)
1487 {
1488 	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1489 		arg->val = val;
1490 		if (sc->arg_fmt[arg->idx].parm)
1491 			arg->parm = sc->arg_fmt[arg->idx].parm;
1492 		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1493 	}
1494 	return scnprintf(bf, size, "%ld", val);
1495 }
1496 
/*
 * Format all arguments of one syscall invocation into 'bf', using the
 * tracepoint field list (sc->args) and per-arg beautifiers, or raw
 * "argN: value" pairs when the format file could not be read.
 * 'augmented_args' is extra payload (e.g. string contents) appended to
 * the sample after the raw args, NULL when unavailable.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;	/* one bit per arg, checked against arg.mask below */
	struct syscall_arg arg = {
		.args	= args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* Args flagged in arg.mask are skipped entirely. */
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in an strarray for
			 * it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1573 
/* Signature shared by the per-tracepoint sample handlers in this tool. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1577 
/*
 * Return the struct syscall for 'id', reading its info lazily on first
 * use.  Returns NULL (noting it on trace->output, verbosity permitting)
 * for the bogus -1 ids some kernels emit or when the info can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Not seen before (name == NULL)? Read its info now. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Recheck: trace__read_syscall_info() may have left the entry unset. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1620 
1621 static void thread__update_stats(struct thread_trace *ttrace,
1622 				 int id, struct perf_sample *sample)
1623 {
1624 	struct int_node *inode;
1625 	struct stats *stats;
1626 	u64 duration = 0;
1627 
1628 	inode = intlist__findnew(ttrace->syscall_stats, id);
1629 	if (inode == NULL)
1630 		return;
1631 
1632 	stats = inode->priv;
1633 	if (stats == NULL) {
1634 		stats = malloc(sizeof(struct stats));
1635 		if (stats == NULL)
1636 			return;
1637 		init_stats(stats);
1638 		inode->priv = stats;
1639 	}
1640 
1641 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1642 		duration = sample->time - ttrace->entry_time;
1643 
1644 	update_stats(stats, duration);
1645 }
1646 
1647 static int trace__printf_interrupted_entry(struct trace *trace)
1648 {
1649 	struct thread_trace *ttrace;
1650 	size_t printed;
1651 
1652 	if (trace->failure_only || trace->current == NULL)
1653 		return 0;
1654 
1655 	ttrace = thread__priv(trace->current);
1656 
1657 	if (!ttrace->entry_pending)
1658 		return 0;
1659 
1660 	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
1661 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1662 	ttrace->entry_pending = false;
1663 
1664 	return printed;
1665 }
1666 
1667 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
1668 				 struct perf_sample *sample, struct thread *thread)
1669 {
1670 	int printed = 0;
1671 
1672 	if (trace->print_sample) {
1673 		double ts = (double)sample->time / NSEC_PER_MSEC;
1674 
1675 		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
1676 				   perf_evsel__name(evsel), ts,
1677 				   thread__comm_str(thread),
1678 				   sample->pid, sample->tid, sample->cpu);
1679 	}
1680 
1681 	return printed;
1682 }
1683 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the per-thread
 * entry_str.  Printing is deferred until the matching sys_exit so the
 * duration can go on the same line — except for exit()/exit_group(),
 * which never return and are printed immediately.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the buffer where the enter line is formatted. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush any still-pending enter line from a previous syscall. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, NULL, 0, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() have no sys_exit: print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1744 
/*
 * Variant of trace__sys_enter() that prints the formatted args right
 * away, for samples carrying augmented payload (e.g. string contents)
 * after the raw args.
 */
static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	char msg[1024];
	void *args, *augmented_args = NULL;
	int augmented_args_size;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	/*
	 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args()
	 * and the rest of the beautifiers accessing it via struct syscall_arg touches it.
	 */
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
	/* Anything past the raw args is the augmenter's extra payload. */
	augmented_args_size = sample->raw_size - sc->args_size;
	if (augmented_args_size > 0)
		augmented_args = sample->raw_data + sc->args_size;

	syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread);
	fprintf(trace->output, "%s", msg);
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1780 
1781 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1782 				    struct perf_sample *sample,
1783 				    struct callchain_cursor *cursor)
1784 {
1785 	struct addr_location al;
1786 	int max_stack = evsel->attr.sample_max_stack ?
1787 			evsel->attr.sample_max_stack :
1788 			trace->max_stack;
1789 
1790 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1791 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack))
1792 		return -1;
1793 
1794 	return 0;
1795 }
1796 
1797 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1798 {
1799 	/* TODO: user-configurable print_opts */
1800 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1801 				        EVSEL__PRINT_DSO |
1802 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1803 
1804 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1805 }
1806 
/* Map a positive errno value to its name, using the arch recorded in the env. */
static const char *errno_to_name(struct perf_evsel *evsel, int err)
{
	return arch_syscalls__strerrno(perf_env__arch(perf_evsel__env(evsel)), err);
}
1814 
/*
 * raw_syscalls:sys_exit handler: compute the duration, print the pending
 * enter line (or "... [continued]" when the enter was missed or already
 * flushed) and then beautify the return value (errno name, hex, child
 * comm for pid returns, etc).
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open()/openat() returned an fd: cache the vfs_getname'd pathname for it. */
	if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0: never saw the sys_enter, the duration is unknown. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Resolved but shallower than --min-stack: skip the event. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only || (ret >= 0 && trace->failure_only))
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The enter line was already flushed (or was never seen). */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = errno_to_name(evsel, -ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* A beautifier installed a custom return formatter: use it once. */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* The return value is a pid: show its comm when we know it. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1930 
/*
 * Handler for the probe:vfs_getname tracepoint: captures the pathname being
 * resolved by the kernel and, if a syscall entry line is pending for this
 * thread, splices the pathname into the already-formatted entry string at
 * the position recorded by the pointer beautifier (filename.entry_str_pos).
 *
 * Always returns 0; failures simply leave the entry string untouched.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	/* NOTE(review): result is not NULL-checked before strlen() below — assumes
	 * the "pathname" field always exists on this tracepoint; verify. */
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread pathname buffer if this name is the longest seen. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No syscall entry is waiting for a pathname; nothing to splice. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the pathname (most informative part) if it won't fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at entry_str_pos and copy the pathname into it. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1991 
1992 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1993 				     union perf_event *event __maybe_unused,
1994 				     struct perf_sample *sample)
1995 {
1996         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1997 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1998 	struct thread *thread = machine__findnew_thread(trace->host,
1999 							sample->pid,
2000 							sample->tid);
2001 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2002 
2003 	if (ttrace == NULL)
2004 		goto out_dump;
2005 
2006 	ttrace->runtime_ms += runtime_ms;
2007 	trace->runtime_ms += runtime_ms;
2008 out_put:
2009 	thread__put(thread);
2010 	return 0;
2011 
2012 out_dump:
2013 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2014 	       evsel->name,
2015 	       perf_evsel__strval(evsel, sample, "comm"),
2016 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2017 	       runtime,
2018 	       perf_evsel__intval(evsel, sample, "vruntime"));
2019 	goto out_put;
2020 }
2021 
2022 static int bpf_output__printer(enum binary_printer_ops op,
2023 			       unsigned int val, void *extra __maybe_unused, FILE *fp)
2024 {
2025 	unsigned char ch = (unsigned char)val;
2026 
2027 	switch (op) {
2028 	case BINARY_PRINT_CHAR_DATA:
2029 		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
2030 	case BINARY_PRINT_DATA_BEGIN:
2031 	case BINARY_PRINT_LINE_BEGIN:
2032 	case BINARY_PRINT_ADDR:
2033 	case BINARY_PRINT_NUM_DATA:
2034 	case BINARY_PRINT_NUM_PAD:
2035 	case BINARY_PRINT_SEP:
2036 	case BINARY_PRINT_CHAR_PAD:
2037 	case BINARY_PRINT_LINE_END:
2038 	case BINARY_PRINT_DATA_END:
2039 	default:
2040 		break;
2041 	}
2042 
2043 	return 0;
2044 }
2045 
/*
 * Dump a BPF output event's raw payload as a character stream (8 bytes per
 * printer line) to the trace output file, via bpf_output__printer().
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
}
2052 
/*
 * Generic handler for --event tracepoints and BPF output events: prints a
 * timestamped line for the event, formatting it either as an augmented
 * syscall entry, as a BPF output byte dump, or via libtraceevent's generic
 * tracepoint formatter, then optionally the resolved callchain.
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Skip events whose stack is shallower than --min-stack. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Placeholder column where the syscall duration would normally go. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	if (evsel == trace->syscalls.events.augmented) {
		int id = perf_evsel__sc_tp_uint(evsel, id, sample);
		struct syscall *sc = trace__syscall_info(trace, evsel, id);

		if (sc) {
			struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

			if (thread) {
				trace__fprintf_comm_tid(trace, thread, trace->output);
				fprintf(trace->output, "%s(", sc->name);
				trace__fprintf_sys_enter(trace, evsel, sample);
				fputc(')', trace->output);
				thread__put(thread);
				goto newline;
			}
		}

		/*
		 * XXX: Not having the associated syscall info or not finding/adding
		 * 	the thread should never happen, but if it does...
		 * 	fall thru and print it as a bpf_output event.
		 */
	}

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		/*
		 * Use the generic formatter unless this is a sys_enter_* event
		 * that trace__fprintf_sys_enter() managed to beautify (== 0).
		 */
		if (strncmp(evsel->tp_format->name, "sys_enter_", 10) ||
		    trace__fprintf_sys_enter(trace, evsel, sample)) {
			event_format__fprintf(evsel->tp_format, sample->cpu,
					      sample->raw_data, sample->raw_size,
					      trace->output);
		}
	}

newline:
	fprintf(trace->output, "\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
2121 
2122 static void print_location(FILE *f, struct perf_sample *sample,
2123 			   struct addr_location *al,
2124 			   bool print_dso, bool print_sym)
2125 {
2126 
2127 	if ((verbose > 0 || print_dso) && al->map)
2128 		fprintf(f, "%s@", al->map->dso->long_name);
2129 
2130 	if ((verbose > 0 || print_sym) && al->sym)
2131 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2132 			al->addr - al->sym->start);
2133 	else if (al->map)
2134 		fprintf(f, "0x%" PRIx64, al->addr);
2135 	else
2136 		fprintf(f, "0x%" PRIx64, sample->addr);
2137 }
2138 
/*
 * Handler for page-fault software events: bumps the per-thread major/minor
 * fault counters and, unless --summary-only, prints a line with the faulting
 * IP location and the faulted-on address location, plus an optional callchain.
 *
 * Returns 0 on success, -1 if per-thread state couldn't be set up or the
 * event was filtered out by --min-stack.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata map by default, see below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the instruction that faulted... */
	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* ...and the address that was faulted on. */
	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	if (!al.map) {
		/*
		 * NOTE(review): this repeats the exact same lookup as above, so
		 * al.map cannot become non-NULL here and map_type 'x' looks
		 * unreachable — presumably a leftover from when the two calls
		 * searched different map types (data vs executable); verify
		 * against git history before changing.
		 */
		thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2211 
2212 static void trace__set_base_time(struct trace *trace,
2213 				 struct perf_evsel *evsel,
2214 				 struct perf_sample *sample)
2215 {
2216 	/*
2217 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2218 	 * and don't use sample->time unconditionally, we may end up having
2219 	 * some other event in the future without PERF_SAMPLE_TIME for good
2220 	 * reason, i.e. we may not be interested in its timestamps, just in
2221 	 * it taking place, picking some piece of information when it
2222 	 * appears in our event stream (vfs_getname comes to mind).
2223 	 */
2224 	if (trace->base_time == 0 && !trace->full_time &&
2225 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2226 		trace->base_time = sample->time;
2227 }
2228 
2229 static int trace__process_sample(struct perf_tool *tool,
2230 				 union perf_event *event,
2231 				 struct perf_sample *sample,
2232 				 struct perf_evsel *evsel,
2233 				 struct machine *machine __maybe_unused)
2234 {
2235 	struct trace *trace = container_of(tool, struct trace, tool);
2236 	struct thread *thread;
2237 	int err = 0;
2238 
2239 	tracepoint_handler handler = evsel->handler;
2240 
2241 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2242 	if (thread && thread__is_filtered(thread))
2243 		goto out;
2244 
2245 	trace__set_base_time(trace, evsel, sample);
2246 
2247 	if (handler) {
2248 		++trace->nr_events;
2249 		handler(trace, evsel, event, sample);
2250 	}
2251 out:
2252 	thread__put(thread);
2253 	return err;
2254 }
2255 
2256 static int trace__record(struct trace *trace, int argc, const char **argv)
2257 {
2258 	unsigned int rec_argc, i, j;
2259 	const char **rec_argv;
2260 	const char * const record_args[] = {
2261 		"record",
2262 		"-R",
2263 		"-m", "1024",
2264 		"-c", "1",
2265 	};
2266 
2267 	const char * const sc_args[] = { "-e", };
2268 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2269 	const char * const majpf_args[] = { "-e", "major-faults" };
2270 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2271 	const char * const minpf_args[] = { "-e", "minor-faults" };
2272 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2273 
2274 	/* +1 is for the event string below */
2275 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2276 		majpf_args_nr + minpf_args_nr + argc;
2277 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2278 
2279 	if (rec_argv == NULL)
2280 		return -ENOMEM;
2281 
2282 	j = 0;
2283 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2284 		rec_argv[j++] = record_args[i];
2285 
2286 	if (trace->trace_syscalls) {
2287 		for (i = 0; i < sc_args_nr; i++)
2288 			rec_argv[j++] = sc_args[i];
2289 
2290 		/* event string may be different for older kernels - e.g., RHEL6 */
2291 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2292 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2293 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2294 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2295 		else {
2296 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2297 			free(rec_argv);
2298 			return -1;
2299 		}
2300 	}
2301 
2302 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2303 		for (i = 0; i < majpf_args_nr; i++)
2304 			rec_argv[j++] = majpf_args[i];
2305 
2306 	if (trace->trace_pgfaults & TRACE_PFMIN)
2307 		for (i = 0; i < minpf_args_nr; i++)
2308 			rec_argv[j++] = minpf_args[i];
2309 
2310 	for (i = 0; i < (unsigned int)argc; i++)
2311 		rec_argv[j++] = argv[i];
2312 
2313 	return cmd_record(j, rec_argv);
2314 }
2315 
2316 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2317 
2318 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2319 {
2320 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2321 
2322 	if (IS_ERR(evsel))
2323 		return false;
2324 
2325 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2326 		perf_evsel__delete(evsel);
2327 		return false;
2328 	}
2329 
2330 	evsel->handler = trace__vfs_getname;
2331 	perf_evlist__add(evlist, evsel);
2332 	return true;
2333 }
2334 
2335 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2336 {
2337 	struct perf_evsel *evsel;
2338 	struct perf_event_attr attr = {
2339 		.type = PERF_TYPE_SOFTWARE,
2340 		.mmap_data = 1,
2341 	};
2342 
2343 	attr.config = config;
2344 	attr.sample_period = 1;
2345 
2346 	event_attr_init(&attr);
2347 
2348 	evsel = perf_evsel__new(&attr);
2349 	if (evsel)
2350 		evsel->handler = trace__pgfault;
2351 
2352 	return evsel;
2353 }
2354 
2355 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2356 {
2357 	const u32 type = event->header.type;
2358 	struct perf_evsel *evsel;
2359 
2360 	if (type != PERF_RECORD_SAMPLE) {
2361 		trace__process_event(trace, trace->host, event, sample);
2362 		return;
2363 	}
2364 
2365 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2366 	if (evsel == NULL) {
2367 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2368 		return;
2369 	}
2370 
2371 	trace__set_base_time(trace, evsel, sample);
2372 
2373 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2374 	    sample->raw_data == NULL) {
2375 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2376 		       perf_evsel__name(evsel), sample->tid,
2377 		       sample->cpu, sample->raw_size);
2378 	} else {
2379 		tracepoint_handler handler = evsel->handler;
2380 		handler(trace, evsel, event, sample);
2381 	}
2382 }
2383 
2384 static int trace__add_syscall_newtp(struct trace *trace)
2385 {
2386 	int ret = -1;
2387 	struct perf_evlist *evlist = trace->evlist;
2388 	struct perf_evsel *sys_enter, *sys_exit;
2389 
2390 	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
2391 	if (sys_enter == NULL)
2392 		goto out;
2393 
2394 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2395 		goto out_delete_sys_enter;
2396 
2397 	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
2398 	if (sys_exit == NULL)
2399 		goto out_delete_sys_enter;
2400 
2401 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2402 		goto out_delete_sys_exit;
2403 
2404 	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
2405 	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);
2406 
2407 	perf_evlist__add(evlist, sys_enter);
2408 	perf_evlist__add(evlist, sys_exit);
2409 
2410 	if (callchain_param.enabled && !trace->kernel_syscallchains) {
2411 		/*
2412 		 * We're interested only in the user space callchain
2413 		 * leading to the syscall, allow overriding that for
2414 		 * debugging reasons using --kernel_syscall_callchains
2415 		 */
2416 		sys_exit->attr.exclude_callchain_kernel = 1;
2417 	}
2418 
2419 	trace->syscalls.events.sys_enter = sys_enter;
2420 	trace->syscalls.events.sys_exit  = sys_exit;
2421 
2422 	ret = 0;
2423 out:
2424 	return ret;
2425 
2426 out_delete_sys_exit:
2427 	perf_evsel__delete_priv(sys_exit);
2428 out_delete_sys_enter:
2429 	perf_evsel__delete_priv(sys_enter);
2430 	goto out;
2431 }
2432 
2433 static int trace__set_ev_qualifier_filter(struct trace *trace)
2434 {
2435 	int err = -1;
2436 	struct perf_evsel *sys_exit;
2437 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2438 						trace->ev_qualifier_ids.nr,
2439 						trace->ev_qualifier_ids.entries);
2440 
2441 	if (filter == NULL)
2442 		goto out_enomem;
2443 
2444 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2445 					  filter)) {
2446 		sys_exit = trace->syscalls.events.sys_exit;
2447 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2448 	}
2449 
2450 	free(filter);
2451 out:
2452 	return err;
2453 out_enomem:
2454 	errno = ENOMEM;
2455 	goto out;
2456 }
2457 
/*
 * Build the list of pids whose events must be filtered out to avoid a
 * feedback loop: always our own pid, plus — when tracing from an ssh
 * session — the sshd ancestor that carries our output back to the terminal
 * (writing trace output generates syscalls in sshd, which would generate
 * more output, ad infinitum).  Walks up the parent chain looking for "sshd".
 *
 * Returns the result of installing the pid filter on the evlist.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* NOTE(review): neither 'thread' nor intermediate 'parent' references
	 * obtained here are put; assumes machine__find_thread() hands out
	 * borrowed references in this context — verify. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2481 
/*
 * Main live-tracing loop: sets up all requested events (syscall tracepoints,
 * vfs_getname, page faults, sched_stat_runtime), creates the target maps,
 * opens and mmaps the events, optionally forks the workload, then polls the
 * ring buffers dispatching events until interrupted or the workload exits,
 * finally printing summaries and tearing everything down.
 *
 * Returns 0 on success or a negative error; error messages go to
 * trace->output via the goto labels at the bottom.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	/*
	 * If a global cgroup was set, apply it to all the events without an
	 * explicit cgroup. I.e.:
	 *
	 * 	trace -G A -e sched:*switch
	 *
	 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc
	 * _and_ sched:sched_switch to the 'A' cgroup, while:
	 *
	 * trace -e sched:*switch -G A
	 *
	 * will only set the sched:sched_switch event to the 'A' cgroup, all the
	 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without"
	 * a cgroup (on the root cgroup, sys wide, etc).
	 *
	 * Multiple cgroups:
	 *
	 * trace -G A -e sched:*switch -G B
	 *
	 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes
	 * to the 'B' cgroup.
	 *
	 * evlist__set_default_cgroup() grabs a reference of the passed cgroup
	 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL.
	 */
	if (trace->cgroup)
		evlist__set_default_cgroup(trace->evlist, trace->cgroup);

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, &callchain_param);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;

	/*
	 * Now that we already used evsel->attr to ask the kernel to setup the
	 * events, lets reuse evsel->attr.sample_max_stack as the limit in
	 * trace__resolve_callchain(), allowing per-event max-stack settings
	 * to override an explicitly set --max-stack global setting.
	 */
	evlist__for_each_entry(evlist, evsel) {
		if (evsel__has_callchain(evsel) &&
		    evsel->attr.sample_max_stack == 0)
			evsel->attr.sample_max_stack = trace->max_stack;
	}
again:
	before = trace->nr_events;

	/* Drain every ring buffer, dispatching each event as it is read. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;
		struct perf_mmap *md;

		md = &evlist->mmap[i];
		if (perf_mmap__read_init(md) < 0)
			continue;

		while ((event = perf_mmap__read_event(md)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_mmap__consume(md);

			if (interrupted)
				goto out_disable;

			/* Workload exited: stop producing, keep consuming. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
		perf_mmap__read_done(md);
	}

	/* Nothing new this pass: poll (bounded once 'done') before retrying. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	cgroup__put(trace->cgroup);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The braced block below is only reachable via the goto labels it contains;
 * the brace exists solely to scope errbuf for those error paths.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2758 
/*
 * Replay mode (perf trace -i file): opens a recorded perf.data session,
 * installs trace__process_sample() and the event handlers for the syscall
 * tracepoints (raw_syscalls:* or the older syscalls:* names), vfs_getname
 * and page faults, then processes the whole file through the session layer.
 *
 * Returns 0 on success, -1 (or the processing error) on failure.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data data = {
		.file      = {
			.path = input_name,
		},
		.mode      = PERF_DATA_MODE_READ,
		.force     = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page-fault software events to trace__pgfault(). */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2859 
/* Print the header line for the per-thread summary; returns bytes written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2868 
/*
 * Re-sort the per-thread syscall stats intlist into an rbtree ordered by
 * total time spent (msecs, descending).  The body below is the per-node
 * initializer invoked by the DEFINE_RESORT_RB machinery for each intlist
 * entry ('nd' is the source node, 'entry' the new sorted-tree node).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = number of calls * average duration, scaled to msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2882 
/*
 * Print the per-syscall statistics table (calls, total/min/avg/max msecs and
 * stddev %) for one thread, ordered by total time via the syscall_stats
 * resort rbtree.  Returns the number of bytes written, 0 if there are no
 * stats to show.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats store nanoseconds; convert to msecs for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative standard deviation, in percent */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2925 
2926 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2927 {
2928 	size_t printed = 0;
2929 	struct thread_trace *ttrace = thread__priv(thread);
2930 	double ratio;
2931 
2932 	if (ttrace == NULL)
2933 		return 0;
2934 
2935 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2936 
2937 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2938 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2939 	printed += fprintf(fp, "%.1f%%", ratio);
2940 	if (ttrace->pfmaj)
2941 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2942 	if (ttrace->pfmin)
2943 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2944 	if (trace->sched)
2945 		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2946 	else if (fputc('\n', fp) != EOF)
2947 		++printed;
2948 
2949 	printed += thread__dump_stats(ttrace, trace, fp);
2950 
2951 	return printed;
2952 }
2953 
2954 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2955 {
2956 	return ttrace ? ttrace->nr_events : 0;
2957 }
2958 
/*
 * Resort helper: builds an rb tree of threads keyed on how many events each
 * one generated, used to order the per-thread summary output.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2965 
/*
 * Print the " Summary of events:" section: walk every bucket of the host
 * machine's threads table, resort each bucket's threads by number of events
 * seen and print a per-thread summary for each one.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;
	int i;

	for (i = 0; i < THREADS__TABLE_SIZE; i++) {
		DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i);

		if (threads == NULL) {
			fprintf(fp, "%s", "Error sorting output by nr_events!\n");
			return 0;
		}

		resort_rb__for_each_entry(nd, threads)
			printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

		resort_rb__delete(threads);
	}
	return printed;
}
2987 
2988 static int trace__set_duration(const struct option *opt, const char *str,
2989 			       int unset __maybe_unused)
2990 {
2991 	struct trace *trace = opt->value;
2992 
2993 	trace->duration_filter = atof(str);
2994 	return 0;
2995 }
2996 
2997 static int trace__set_filter_pids(const struct option *opt, const char *str,
2998 				  int unset __maybe_unused)
2999 {
3000 	int ret = -1;
3001 	size_t i;
3002 	struct trace *trace = opt->value;
3003 	/*
3004 	 * FIXME: introduce a intarray class, plain parse csv and create a
3005 	 * { int nr, int entries[] } struct...
3006 	 */
3007 	struct intlist *list = intlist__new(str);
3008 
3009 	if (list == NULL)
3010 		return -1;
3011 
3012 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3013 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3014 
3015 	if (trace->filter_pids.entries == NULL)
3016 		goto out;
3017 
3018 	trace->filter_pids.entries[0] = getpid();
3019 
3020 	for (i = 1; i < trace->filter_pids.nr; ++i)
3021 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3022 
3023 	intlist__delete(list);
3024 	ret = 0;
3025 out:
3026 	return ret;
3027 }
3028 
3029 static int trace__open_output(struct trace *trace, const char *filename)
3030 {
3031 	struct stat st;
3032 
3033 	if (!stat(filename, &st) && st.st_size) {
3034 		char oldname[PATH_MAX];
3035 
3036 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3037 		unlink(oldname);
3038 		rename(filename, oldname);
3039 	}
3040 
3041 	trace->output = fopen(filename, "w");
3042 
3043 	return trace->output == NULL ? -errno : 0;
3044 }
3045 
3046 static int parse_pagefaults(const struct option *opt, const char *str,
3047 			    int unset __maybe_unused)
3048 {
3049 	int *trace_pgfaults = opt->value;
3050 
3051 	if (strcmp(str, "all") == 0)
3052 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3053 	else if (strcmp(str, "maj") == 0)
3054 		*trace_pgfaults |= TRACE_PFMAJ;
3055 	else if (strcmp(str, "min") == 0)
3056 		*trace_pgfaults |= TRACE_PFMIN;
3057 	else
3058 		return -1;
3059 
3060 	return 0;
3061 }
3062 
/* Set the same sample handler on every event in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
3070 
/*
 * For every "syscalls:sys_{enter,exit}_*" tracepoint in the evlist, set up
 * the syscall_tp private area so the id/args/ret fields can be read straight
 * from the sample's raw data.  Events from other subsystems, or ones that
 * already have a private area, are left untouched.  Returns 0 on success,
 * -1 if any field initialization fails.
 */
static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			/* the syscall args start right after the 64-bit id field */
			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			/* the return value comes right after the 64-bit id field */
			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}
3100 
3101 /*
3102  * XXX: Hackish, just splitting the combined -e+--event (syscalls
3103  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
3104  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
3105  *
3106  * It'd be better to introduce a parse_options() variant that would return a
3107  * list with the terms it didn't match to an event...
3108  */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* a leading '!' negates the whole syscall qualifier list */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	/*
	 * Walk the comma separated terms, classifying each as either a
	 * syscall/strace-group name (lists[1]) or some other event (lists[0]).
	 * Each ',' is temporarily NUL-ed so the term can be looked at in
	 * isolation, then restored.
	 */
	while (1) {
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			/* not a syscall name: maybe a strace group file (e.g. "file") */
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			/* fits: each list is bounded by the original string length */
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	/* syscall names/groups go through the strace-like ev_qualifier machinery */
	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
		trace->trace_syscalls = true;
	}

	err = 0;

	/* everything else is handed to the regular -e/--event parser */
	if (lists[0]) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* undo the last temporary ',' -> NUL substitution, if any */
	if (sep)
		*sep = ',';

	return err;
}
3187 
3188 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset)
3189 {
3190 	struct trace *trace = opt->value;
3191 
3192 	if (!list_empty(&trace->evlist->entries))
3193 		return parse_cgroups(opt, str, unset);
3194 
3195 	trace->cgroup = evlist__findnew_cgroup(trace->evlist, str);
3196 
3197 	return 0;
3198 }
3199 
/*
 * Entry point for 'perf trace': parse the command line, set up the evlist
 * (including the BPF __augmented_syscalls__ output event, when available),
 * then either replay a recorded file (-i) or run a live trace.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = false,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN(0, "failure", &trace.failure_only,
		    "Show only syscalls that failed"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_BOOLEAN(0, "print-sample", &trace.print_sample,
			"print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only",
		     trace__parse_cgroups),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	struct perf_evsel *evsel;
	const char * const trace_subcommands[] = { "record", NULL };
	int err = -1;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	/* cgroup filtering only makes sense when monitoring the whole system */
	if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) {
		usage_with_options_msg(trace_usage, trace_options,
				       "cgroup monitoring only available in system-wide mode");
	}

	/*
	 * Set up the BPF "__augmented_syscalls__" output event, used by BPF
	 * programs to copy syscall arg payloads (strings, etc.) to userspace.
	 */
	evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__");
	if (IS_ERR(evsel)) {
		bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf));
		pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf);
		goto out;
	}

	/* evsel may legitimately be NULL when no BPF event was set up */
	if (evsel) {
		if (perf_evsel__init_augmented_syscall_tp(evsel) ||
		    perf_evsel__init_augmented_syscall_tp_args(evsel))
			goto out;
		trace.syscalls.events.augmented = evsel;
	}

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* page fault tracing needs the address and timestamp in each sample */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack();
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* asking for stack limits implies callchains: default to dwarf unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		/* callchains need bigger mmap buffers; only root gets them bumped */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	/* events added via -e/--event/--expr get the generic tracepoint handler */
	if (trace.evlist->nr_entries > 0) {
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}

	/* 'perf trace record' delegates to 'perf record' with a suitable evlist */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	/* nothing was asked for explicitly: default to tracing syscalls */
	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target specified: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3431