xref: /linux/tools/perf/builtin-trace.c (revision ea4539652eccc87b14fbcbc90467ebcb87f02ddb)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 
40 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
41 #include <stdlib.h>
42 #include <sys/mman.h>
43 #include <linux/futex.h>
44 #include <linux/err.h>
45 #include <linux/seccomp.h>
46 #include <linux/filter.h>
47 #include <linux/audit.h>
48 #include <sys/ptrace.h>
49 #include <linux/random.h>
50 
/*
 * For older distros: fallback values for constants that the system headers
 * included above may not provide.  Every definition is guarded by #ifndef, so
 * a value supplied by the headers always wins.
 */

/* mmap()/madvise() constants used by the MAP_* and MADV_* beautifiers below. */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif

#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100

#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd flags used by the EFD_* beautifier below. */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

/* open/pipe flag used by the O_* beautifiers below. */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* Socket type and flags used by the SOCK_* beautifier below. */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif

/* Message flag used by the MSG_* beautifier below. */
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* Flags used by the PERF_FLAG_* beautifier below. */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
116 
/*
 * Top-level state for the 'trace' builtin.
 *
 * NOTE(review): the field notes below are inferred from names and types only;
 * the code that reads and writes them is not in this part of the file, so
 * confirm before relying on them.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall  *table;
		struct {
			/* presumably the syscall entry/exit tracepoint events */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;
	/* counted array of ints; presumably syscall ids for ev_qualifier */
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	/* counted array of pids */
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;
};
167 
/*
 * Accessor for one tracepoint field inside a sample's raw data.
 *
 * @offset is added to sample->raw_data by the accessors below.  The union
 * holds the accessor callback: ->integer for fields read as unsigned integers
 * (set by tp_field__init_uint()), ->pointer for fields returned by address
 * (set by tp_field__init_ptr()).  Only the member that was initialised may be
 * called.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
175 
176 #define TP_UINT_FIELD(bits) \
177 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
178 { \
179 	u##bits value; \
180 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
181 	return value;  \
182 }
183 
184 TP_UINT_FIELD(8);
185 TP_UINT_FIELD(16);
186 TP_UINT_FIELD(32);
187 TP_UINT_FIELD(64);
188 
189 #define TP_UINT_FIELD__SWAPPED(bits) \
190 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
191 { \
192 	u##bits value; \
193 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
194 	return bswap_##bits(value);\
195 }
196 
197 TP_UINT_FIELD__SWAPPED(16);
198 TP_UINT_FIELD__SWAPPED(32);
199 TP_UINT_FIELD__SWAPPED(64);
200 
201 static int tp_field__init_uint(struct tp_field *field,
202 			       struct format_field *format_field,
203 			       bool needs_swap)
204 {
205 	field->offset = format_field->offset;
206 
207 	switch (format_field->size) {
208 	case 1:
209 		field->integer = tp_field__u8;
210 		break;
211 	case 2:
212 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
213 		break;
214 	case 4:
215 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
216 		break;
217 	case 8:
218 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
219 		break;
220 	default:
221 		return -1;
222 	}
223 
224 	return 0;
225 }
226 
/*
 * Pointer accessor: return the address of this field inside the sample's raw
 * data, i.e. sample->raw_data advanced by field->offset.  Nothing is copied.
 */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
231 
232 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
233 {
234 	field->offset = format_field->offset;
235 	field->pointer = tp_field__ptr;
236 	return 0;
237 }
238 
/*
 * Field accessors attached to a syscall tracepoint evsel through evsel->priv
 * (see perf_evsel__init_syscall_tp()).
 *
 * @id is initialised for every such evsel.  @args and @ret share storage in
 * an anonymous union, so only one of them is meaningful for a given evsel --
 * presumably @args for the enter event and @ret for the exit event; confirm
 * against the code that initialises them.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
245 
246 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
247 					  struct tp_field *field,
248 					  const char *name)
249 {
250 	struct format_field *format_field = perf_evsel__field(evsel, name);
251 
252 	if (format_field == NULL)
253 		return -1;
254 
255 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
256 }
257 
/*
 * perf_evsel__init_sc_tp_uint_field(evsel, name)
 *
 * Initialise member @name of the struct syscall_tp hanging off evsel->priv as
 * an unsigned integer accessor for the tracepoint field that is also called
 * @name.  Evaluates to the result of perf_evsel__init_tp_uint_field().
 *
 * @evsel is evaluated exactly once, so an argument with side effects is safe.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct perf_evsel *evsel_ = (evsel); \
	   struct syscall_tp *sc_ = evsel_->priv; \
	   perf_evsel__init_tp_uint_field(evsel_, &sc_->name, #name); })
261 
/*
 * Look up the tracepoint field called @name on @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt != NULL ? tp_field__init_ptr(field, fmt) : -1;
}
273 
/*
 * perf_evsel__init_sc_tp_ptr_field(evsel, name)
 *
 * Initialise member @name of the struct syscall_tp hanging off evsel->priv as
 * a pointer accessor for the tracepoint field that is also called @name.
 * Evaluates to the result of perf_evsel__init_tp_ptr_field().
 *
 * @evsel is evaluated exactly once, so an argument with side effects is safe.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct perf_evsel *evsel_ = (evsel); \
	   struct syscall_tp *sc_ = evsel_->priv; \
	   perf_evsel__init_tp_ptr_field(evsel_, &sc_->name, #name); })
277 
/*
 * Destroy an evsel whose ->priv was allocated by this file: ->priv is
 * released with zfree() first, then the evsel itself is handed to
 * perf_evsel__delete().
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
283 
284 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
285 {
286 	evsel->priv = malloc(sizeof(struct syscall_tp));
287 	if (evsel->priv != NULL) {
288 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
289 			goto out_delete;
290 
291 		evsel->handler = handler;
292 		return 0;
293 	}
294 
295 	return -ENOMEM;
296 
297 out_delete:
298 	zfree(&evsel->priv);
299 	return -ENOENT;
300 }
301 
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare it
 * with perf_evsel__init_syscall_tp().
 *
 * The "raw_syscalls" subsystem is tried first, then "syscalls".  Returns the
 * new evsel, or NULL when neither tracepoint can be created or the
 * initialisation fails (the evsel is destroyed in the latter case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* Older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
322 
/*
 * perf_evsel__sc_tp_uint(evsel, name, sample)
 *
 * Read member @name of the struct syscall_tp in evsel->priv from @sample
 * through its ->integer accessor; evaluates to the u64 it returns.  The member
 * must have been initialised as an integer field.
 *
 * The macro arguments are parenthesised so that arbitrary expressions can be
 * passed for @evsel and @sample.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, (sample)); })
326 
/*
 * perf_evsel__sc_tp_ptr(evsel, name, sample)
 *
 * Locate member @name of the struct syscall_tp in evsel->priv inside @sample
 * through its ->pointer accessor; evaluates to the void * it returns.  The
 * member must have been initialised as a pointer field.
 *
 * The macro arguments are parenthesised so that arbitrary expressions can be
 * passed for @evsel and @sample.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, (sample)); })
330 
/*
 * One syscall argument as handed to the syscall_arg__scnprintf_*() formatters.
 *
 * @val:    the argument value to format.
 * @thread: NOTE(review): presumably the thread that issued the syscall.
 * @trace:  NOTE(review): presumably the owning trace session.
 * @parm:   formatter specific parameter; the strarray formatters expect a
 *          struct strarray * here.
 * @idx:    position of this argument; the open flags formatter uses idx + 1
 *          to address the argument that follows.
 * @mask:   bitmask indexed by argument position.  Formatters set bits in it
 *          for other arguments of the same syscall (e.g. the open flags
 *          formatter masks the mode parameter when O_CREAT is absent).
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;
	u8	      idx;
	u8	      mask;
};
339 
/*
 * Table mapping consecutive integer values to names.
 *
 * @offset:     value that corresponds to entries[0]; the lookup index is
 *              "value - offset".
 * @nr_entries: number of slots in @entries.
 * @entries:    the names.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
345 
/*
 * DEFINE_STRARRAY(array) - define "strarray__<array>", a struct strarray that
 * wraps the string table @array.  The offset is left at zero, so value 0 maps
 * to array[0].  @array must be a real array (ARRAY_SIZE() is applied to it).
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
350 
/*
 * DEFINE_STRARRAY_OFFSET(array, off) - like DEFINE_STRARRAY(), for tables
 * whose first entry corresponds to value @off instead of 0.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
356 
357 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
358 						const char *intfmt,
359 					        struct syscall_arg *arg)
360 {
361 	struct strarray *sa = arg->parm;
362 	int idx = arg->val - sa->offset;
363 
364 	if (idx < 0 || idx >= sa->nr_entries)
365 		return scnprintf(bf, size, intfmt, arg->val);
366 
367 	return scnprintf(bf, size, "%s", sa->entries[idx]);
368 }
369 
/*
 * strarray formatter that prints values without a table entry as a decimal
 * number ("%d").
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
377 
#if defined(__i386__) || defined(__x86_64__)
/*
 * strarray formatter that prints values without a table entry in hexadecimal
 * ("%#x").  Only built on x86.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
391 
/*
 * Forward declaration of the file descriptor formatter, needed by
 * syscall_arg__scnprintf_fd_at() below; the definition is not in this part of
 * the file.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
396 
397 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
398 					   struct syscall_arg *arg)
399 {
400 	int fd = arg->val;
401 
402 	if (fd == AT_FDCWD)
403 		return scnprintf(bf, size, "CWD");
404 
405 	return syscall_arg__scnprintf_fd(bf, size, arg);
406 }
407 
408 #define SCA_FDAT syscall_arg__scnprintf_fd_at
409 
/*
 * Forward declaration of the formatter used for the fd argument of close()
 * (see the "close" entry in syscall_fmts); the definition is not in this part
 * of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
414 
/* Format the argument as a "0x"-prefixed hexadecimal unsigned long. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
422 
423 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
424 					 struct syscall_arg *arg)
425 {
426 	return scnprintf(bf, size, "%d", arg->val);
427 }
428 
429 #define SCA_INT syscall_arg__scnprintf_int
430 
431 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
432 					       struct syscall_arg *arg)
433 {
434 	int printed = 0, prot = arg->val;
435 
436 	if (prot == PROT_NONE)
437 		return scnprintf(bf, size, "NONE");
438 #define	P_MMAP_PROT(n) \
439 	if (prot & PROT_##n) { \
440 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
441 		prot &= ~PROT_##n; \
442 	}
443 
444 	P_MMAP_PROT(EXEC);
445 	P_MMAP_PROT(READ);
446 	P_MMAP_PROT(WRITE);
447 #ifdef PROT_SEM
448 	P_MMAP_PROT(SEM);
449 #endif
450 	P_MMAP_PROT(GROWSDOWN);
451 	P_MMAP_PROT(GROWSUP);
452 #undef P_MMAP_PROT
453 
454 	if (prot)
455 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
456 
457 	return printed;
458 }
459 
460 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
461 
462 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
463 						struct syscall_arg *arg)
464 {
465 	int printed = 0, flags = arg->val;
466 
467 #define	P_MMAP_FLAG(n) \
468 	if (flags & MAP_##n) { \
469 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
470 		flags &= ~MAP_##n; \
471 	}
472 
473 	P_MMAP_FLAG(SHARED);
474 	P_MMAP_FLAG(PRIVATE);
475 #ifdef MAP_32BIT
476 	P_MMAP_FLAG(32BIT);
477 #endif
478 	P_MMAP_FLAG(ANONYMOUS);
479 	P_MMAP_FLAG(DENYWRITE);
480 	P_MMAP_FLAG(EXECUTABLE);
481 	P_MMAP_FLAG(FILE);
482 	P_MMAP_FLAG(FIXED);
483 	P_MMAP_FLAG(GROWSDOWN);
484 #ifdef MAP_HUGETLB
485 	P_MMAP_FLAG(HUGETLB);
486 #endif
487 	P_MMAP_FLAG(LOCKED);
488 	P_MMAP_FLAG(NONBLOCK);
489 	P_MMAP_FLAG(NORESERVE);
490 	P_MMAP_FLAG(POPULATE);
491 	P_MMAP_FLAG(STACK);
492 #ifdef MAP_UNINITIALIZED
493 	P_MMAP_FLAG(UNINITIALIZED);
494 #endif
495 #undef P_MMAP_FLAG
496 
497 	if (flags)
498 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
499 
500 	return printed;
501 }
502 
503 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
504 
505 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
506 						  struct syscall_arg *arg)
507 {
508 	int printed = 0, flags = arg->val;
509 
510 #define P_MREMAP_FLAG(n) \
511 	if (flags & MREMAP_##n) { \
512 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
513 		flags &= ~MREMAP_##n; \
514 	}
515 
516 	P_MREMAP_FLAG(MAYMOVE);
517 #ifdef MREMAP_FIXED
518 	P_MREMAP_FLAG(FIXED);
519 #endif
520 #undef P_MREMAP_FLAG
521 
522 	if (flags)
523 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
524 
525 	return printed;
526 }
527 
528 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
529 
530 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
531 						      struct syscall_arg *arg)
532 {
533 	int behavior = arg->val;
534 
535 	switch (behavior) {
536 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
537 	P_MADV_BHV(NORMAL);
538 	P_MADV_BHV(RANDOM);
539 	P_MADV_BHV(SEQUENTIAL);
540 	P_MADV_BHV(WILLNEED);
541 	P_MADV_BHV(DONTNEED);
542 	P_MADV_BHV(REMOVE);
543 	P_MADV_BHV(DONTFORK);
544 	P_MADV_BHV(DOFORK);
545 	P_MADV_BHV(HWPOISON);
546 #ifdef MADV_SOFT_OFFLINE
547 	P_MADV_BHV(SOFT_OFFLINE);
548 #endif
549 	P_MADV_BHV(MERGEABLE);
550 	P_MADV_BHV(UNMERGEABLE);
551 #ifdef MADV_HUGEPAGE
552 	P_MADV_BHV(HUGEPAGE);
553 #endif
554 #ifdef MADV_NOHUGEPAGE
555 	P_MADV_BHV(NOHUGEPAGE);
556 #endif
557 #ifdef MADV_DONTDUMP
558 	P_MADV_BHV(DONTDUMP);
559 #endif
560 #ifdef MADV_DODUMP
561 	P_MADV_BHV(DODUMP);
562 #endif
563 #undef P_MADV_PHV
564 	default: break;
565 	}
566 
567 	return scnprintf(bf, size, "%#x", behavior);
568 }
569 
570 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
571 
572 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
573 					   struct syscall_arg *arg)
574 {
575 	int printed = 0, op = arg->val;
576 
577 	if (op == 0)
578 		return scnprintf(bf, size, "NONE");
579 #define	P_CMD(cmd) \
580 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
581 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
582 		op &= ~LOCK_##cmd; \
583 	}
584 
585 	P_CMD(SH);
586 	P_CMD(EX);
587 	P_CMD(NB);
588 	P_CMD(UN);
589 	P_CMD(MAND);
590 	P_CMD(RW);
591 	P_CMD(READ);
592 	P_CMD(WRITE);
593 #undef P_OP
594 
595 	if (op)
596 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
597 
598 	return printed;
599 }
600 
601 #define SCA_FLOCK syscall_arg__scnprintf_flock
602 
603 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
604 {
605 	enum syscall_futex_args {
606 		SCF_UADDR   = (1 << 0),
607 		SCF_OP	    = (1 << 1),
608 		SCF_VAL	    = (1 << 2),
609 		SCF_TIMEOUT = (1 << 3),
610 		SCF_UADDR2  = (1 << 4),
611 		SCF_VAL3    = (1 << 5),
612 	};
613 	int op = arg->val;
614 	int cmd = op & FUTEX_CMD_MASK;
615 	size_t printed = 0;
616 
617 	switch (cmd) {
618 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
619 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
620 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
621 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
622 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
623 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
624 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
625 	P_FUTEX_OP(WAKE_OP);							  break;
626 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
627 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
628 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
629 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
630 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
631 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
632 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
633 	}
634 
635 	if (op & FUTEX_PRIVATE_FLAG)
636 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
637 
638 	if (op & FUTEX_CLOCK_REALTIME)
639 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
640 
641 	return printed;
642 }
643 
644 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
645 
/*
 * Names for the "cmd" argument of bpf(), indexed by its numeric value
 * starting at 0 (wired to argument 0 of "bpf" in syscall_fmts).
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);
651 
/* epoll_ctl() operation names; the table starts at value 1 ("ADD"). */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
654 
/* Interval timer names, indexed by value starting at 0. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);
657 
/* keyctl() option names, indexed by value starting at 0. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);
666 
/*
 * Seek "whence" names, indexed by value starting at 0.  "DATA" and "HOLE" are
 * only part of the table when the headers define SEEK_DATA / SEEK_HOLE.
 */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
676 
/*
 * fcntl() command names, indexed by value starting at 0.
 *
 * NOTE(review): the later entries keep their "F_" prefix while the earlier
 * ones do not; the strings are emitted as is, so this shows up in the output.
 */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);
684 
/* Resource limit names, indexed by value starting at 0. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);
691 
/* Signal mask "how" names, indexed by value starting at 0. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);
694 
/*
 * Clock id names, indexed by value starting at 0 (wired to argument 0 of
 * "clock_gettime" in syscall_fmts).
 */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);
701 
/* Socket address/protocol family names, indexed by value starting at 0. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
711 
712 #ifndef SOCK_TYPE_MASK
713 #define SOCK_TYPE_MASK 0xf
714 #endif
715 
716 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
717 						      struct syscall_arg *arg)
718 {
719 	size_t printed;
720 	int type = arg->val,
721 	    flags = type & ~SOCK_TYPE_MASK;
722 
723 	type &= SOCK_TYPE_MASK;
724 	/*
725  	 * Can't use a strarray, MIPS may override for ABI reasons.
726  	 */
727 	switch (type) {
728 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
729 	P_SK_TYPE(STREAM);
730 	P_SK_TYPE(DGRAM);
731 	P_SK_TYPE(RAW);
732 	P_SK_TYPE(RDM);
733 	P_SK_TYPE(SEQPACKET);
734 	P_SK_TYPE(DCCP);
735 	P_SK_TYPE(PACKET);
736 #undef P_SK_TYPE
737 	default:
738 		printed = scnprintf(bf, size, "%#x", type);
739 	}
740 
741 #define	P_SK_FLAG(n) \
742 	if (flags & SOCK_##n) { \
743 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
744 		flags &= ~SOCK_##n; \
745 	}
746 
747 	P_SK_FLAG(CLOEXEC);
748 	P_SK_FLAG(NONBLOCK);
749 #undef P_SK_FLAG
750 
751 	if (flags)
752 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
753 
754 	return printed;
755 }
756 
757 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
758 
759 #ifndef MSG_PROBE
760 #define MSG_PROBE	     0x10
761 #endif
762 #ifndef MSG_WAITFORONE
763 #define MSG_WAITFORONE	0x10000
764 #endif
765 #ifndef MSG_SENDPAGE_NOTLAST
766 #define MSG_SENDPAGE_NOTLAST 0x20000
767 #endif
768 #ifndef MSG_FASTOPEN
769 #define MSG_FASTOPEN	     0x20000000
770 #endif
771 
772 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
773 					       struct syscall_arg *arg)
774 {
775 	int printed = 0, flags = arg->val;
776 
777 	if (flags == 0)
778 		return scnprintf(bf, size, "NONE");
779 #define	P_MSG_FLAG(n) \
780 	if (flags & MSG_##n) { \
781 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
782 		flags &= ~MSG_##n; \
783 	}
784 
785 	P_MSG_FLAG(OOB);
786 	P_MSG_FLAG(PEEK);
787 	P_MSG_FLAG(DONTROUTE);
788 	P_MSG_FLAG(TRYHARD);
789 	P_MSG_FLAG(CTRUNC);
790 	P_MSG_FLAG(PROBE);
791 	P_MSG_FLAG(TRUNC);
792 	P_MSG_FLAG(DONTWAIT);
793 	P_MSG_FLAG(EOR);
794 	P_MSG_FLAG(WAITALL);
795 	P_MSG_FLAG(FIN);
796 	P_MSG_FLAG(SYN);
797 	P_MSG_FLAG(CONFIRM);
798 	P_MSG_FLAG(RST);
799 	P_MSG_FLAG(ERRQUEUE);
800 	P_MSG_FLAG(NOSIGNAL);
801 	P_MSG_FLAG(MORE);
802 	P_MSG_FLAG(WAITFORONE);
803 	P_MSG_FLAG(SENDPAGE_NOTLAST);
804 	P_MSG_FLAG(FASTOPEN);
805 	P_MSG_FLAG(CMSG_CLOEXEC);
806 #undef P_MSG_FLAG
807 
808 	if (flags)
809 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
810 
811 	return printed;
812 }
813 
814 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
815 
816 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
817 						 struct syscall_arg *arg)
818 {
819 	size_t printed = 0;
820 	int mode = arg->val;
821 
822 	if (mode == F_OK) /* 0 */
823 		return scnprintf(bf, size, "F");
824 #define	P_MODE(n) \
825 	if (mode & n##_OK) { \
826 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
827 		mode &= ~n##_OK; \
828 	}
829 
830 	P_MODE(R);
831 	P_MODE(W);
832 	P_MODE(X);
833 #undef P_MODE
834 
835 	if (mode)
836 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
837 
838 	return printed;
839 }
840 
841 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
842 
/*
 * Forward declaration of the filename formatter referenced from syscall_fmts;
 * the definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
847 
848 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
849 					       struct syscall_arg *arg)
850 {
851 	int printed = 0, flags = arg->val;
852 
853 	if (!(flags & O_CREAT))
854 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
855 
856 	if (flags == 0)
857 		return scnprintf(bf, size, "RDONLY");
858 #define	P_FLAG(n) \
859 	if (flags & O_##n) { \
860 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
861 		flags &= ~O_##n; \
862 	}
863 
864 	P_FLAG(APPEND);
865 	P_FLAG(ASYNC);
866 	P_FLAG(CLOEXEC);
867 	P_FLAG(CREAT);
868 	P_FLAG(DIRECT);
869 	P_FLAG(DIRECTORY);
870 	P_FLAG(EXCL);
871 	P_FLAG(LARGEFILE);
872 	P_FLAG(NOATIME);
873 	P_FLAG(NOCTTY);
874 #ifdef O_NONBLOCK
875 	P_FLAG(NONBLOCK);
876 #elif O_NDELAY
877 	P_FLAG(NDELAY);
878 #endif
879 #ifdef O_PATH
880 	P_FLAG(PATH);
881 #endif
882 	P_FLAG(RDWR);
883 #ifdef O_DSYNC
884 	if ((flags & O_SYNC) == O_SYNC)
885 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
886 	else {
887 		P_FLAG(DSYNC);
888 	}
889 #else
890 	P_FLAG(SYNC);
891 #endif
892 	P_FLAG(TRUNC);
893 	P_FLAG(WRONLY);
894 #undef P_FLAG
895 
896 	if (flags)
897 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
898 
899 	return printed;
900 }
901 
902 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
903 
904 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
905 						struct syscall_arg *arg)
906 {
907 	int printed = 0, flags = arg->val;
908 
909 	if (flags == 0)
910 		return 0;
911 
912 #define	P_FLAG(n) \
913 	if (flags & PERF_FLAG_##n) { \
914 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
915 		flags &= ~PERF_FLAG_##n; \
916 	}
917 
918 	P_FLAG(FD_NO_GROUP);
919 	P_FLAG(FD_OUTPUT);
920 	P_FLAG(PID_CGROUP);
921 	P_FLAG(FD_CLOEXEC);
922 #undef P_FLAG
923 
924 	if (flags)
925 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
926 
927 	return printed;
928 }
929 
930 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
931 
932 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
933 						   struct syscall_arg *arg)
934 {
935 	int printed = 0, flags = arg->val;
936 
937 	if (flags == 0)
938 		return scnprintf(bf, size, "NONE");
939 #define	P_FLAG(n) \
940 	if (flags & EFD_##n) { \
941 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
942 		flags &= ~EFD_##n; \
943 	}
944 
945 	P_FLAG(SEMAPHORE);
946 	P_FLAG(CLOEXEC);
947 	P_FLAG(NONBLOCK);
948 #undef P_FLAG
949 
950 	if (flags)
951 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
952 
953 	return printed;
954 }
955 
956 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
957 
958 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
959 						struct syscall_arg *arg)
960 {
961 	int printed = 0, flags = arg->val;
962 
963 #define	P_FLAG(n) \
964 	if (flags & O_##n) { \
965 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
966 		flags &= ~O_##n; \
967 	}
968 
969 	P_FLAG(CLOEXEC);
970 	P_FLAG(NONBLOCK);
971 #undef P_FLAG
972 
973 	if (flags)
974 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
975 
976 	return printed;
977 }
978 
979 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
980 
981 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
982 {
983 	int sig = arg->val;
984 
985 	switch (sig) {
986 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
987 	P_SIGNUM(HUP);
988 	P_SIGNUM(INT);
989 	P_SIGNUM(QUIT);
990 	P_SIGNUM(ILL);
991 	P_SIGNUM(TRAP);
992 	P_SIGNUM(ABRT);
993 	P_SIGNUM(BUS);
994 	P_SIGNUM(FPE);
995 	P_SIGNUM(KILL);
996 	P_SIGNUM(USR1);
997 	P_SIGNUM(SEGV);
998 	P_SIGNUM(USR2);
999 	P_SIGNUM(PIPE);
1000 	P_SIGNUM(ALRM);
1001 	P_SIGNUM(TERM);
1002 	P_SIGNUM(CHLD);
1003 	P_SIGNUM(CONT);
1004 	P_SIGNUM(STOP);
1005 	P_SIGNUM(TSTP);
1006 	P_SIGNUM(TTIN);
1007 	P_SIGNUM(TTOU);
1008 	P_SIGNUM(URG);
1009 	P_SIGNUM(XCPU);
1010 	P_SIGNUM(XFSZ);
1011 	P_SIGNUM(VTALRM);
1012 	P_SIGNUM(PROF);
1013 	P_SIGNUM(WINCH);
1014 	P_SIGNUM(IO);
1015 	P_SIGNUM(PWR);
1016 	P_SIGNUM(SYS);
1017 #ifdef SIGEMT
1018 	P_SIGNUM(EMT);
1019 #endif
1020 #ifdef SIGSTKFLT
1021 	P_SIGNUM(STKFLT);
1022 #endif
1023 #ifdef SIGSWI
1024 	P_SIGNUM(SWI);
1025 #endif
1026 	default: break;
1027 	}
1028 
1029 	return scnprintf(bf, size, "%#x", sig);
1030 }
1031 
1032 #define SCA_SIGNUM syscall_arg__scnprintf_signum
1033 
#if defined(__i386__) || defined(__x86_64__)
/*
 * Terminal ioctl request names, x86 only.
 *
 * FIXME: Make this available to all arches.
 *
 * The table starts at request 0x5401; entry i names request 0x5401 + i.  The
 * designated initialisers ([0x27], [0x50], [0x60]) skip ranges, so the slots
 * in between are NULL.
 *
 * NOTE(review): TCGETS is defined here with the same value as the offset
 * passed to DEFINE_STRARRAY_OFFSET() below, but the literal is repeated there
 * instead of using the macro.
 */
#define TCGETS		0x5401

static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
1060 
1061 static size_t syscall_arg__scnprintf_seccomp_op(char *bf, size_t size, struct syscall_arg *arg)
1062 {
1063 	int op = arg->val;
1064 	size_t printed = 0;
1065 
1066 	switch (op) {
1067 #define	P_SECCOMP_SET_MODE_OP(n) case SECCOMP_SET_MODE_##n: printed = scnprintf(bf, size, #n); break
1068 	P_SECCOMP_SET_MODE_OP(STRICT);
1069 	P_SECCOMP_SET_MODE_OP(FILTER);
1070 #undef P_SECCOMP_SET_MODE_OP
1071 	default: printed = scnprintf(bf, size, "%#x", op);			  break;
1072 	}
1073 
1074 	return printed;
1075 }
1076 
1077 #define SCA_SECCOMP_OP  syscall_arg__scnprintf_seccomp_op
1078 
1079 static size_t syscall_arg__scnprintf_seccomp_flags(char *bf, size_t size,
1080 						   struct syscall_arg *arg)
1081 {
1082 	int printed = 0, flags = arg->val;
1083 
1084 #define	P_FLAG(n) \
1085 	if (flags & SECCOMP_FILTER_FLAG_##n) { \
1086 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
1087 		flags &= ~SECCOMP_FILTER_FLAG_##n; \
1088 	}
1089 
1090 	P_FLAG(TSYNC);
1091 #undef P_FLAG
1092 
1093 	if (flags)
1094 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
1095 
1096 	return printed;
1097 }
1098 
1099 #define SCA_SECCOMP_FLAGS syscall_arg__scnprintf_seccomp_flags
1100 
1101 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
1102 						   struct syscall_arg *arg)
1103 {
1104 	int printed = 0, flags = arg->val;
1105 
1106 #define	P_FLAG(n) \
1107 	if (flags & GRND_##n) { \
1108 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
1109 		flags &= ~GRND_##n; \
1110 	}
1111 
1112 	P_FLAG(RANDOM);
1113 	P_FLAG(NONBLOCK);
1114 #undef P_FLAG
1115 
1116 	if (flags)
1117 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
1118 
1119 	return printed;
1120 }
1121 
1122 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
1123 
/*
 * Designated-initializer fragment for a syscall_fmts[] entry: argument index
 * 'arg' is formatted with SCA_STRARRAY, using strarray__<array> as its
 * parameter. 'name' does not appear in the expansion; it only labels the
 * argument at the use site.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
1127 
1128 #include "trace/beauty/pid.c"
1129 #include "trace/beauty/mode_t.c"
1130 #include "trace/beauty/sched_policy.c"
1131 #include "trace/beauty/waitid_options.c"
1132 
1133 static struct syscall_fmt {
1134 	const char *name;
1135 	const char *alias;
1136 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1137 	void	   *arg_parm[6];
1138 	bool	   errmsg;
1139 	bool	   errpid;
1140 	bool	   timeout;
1141 	bool	   hexret;
1142 } syscall_fmts[] = {
1143 	{ .name	    = "access",	    .errmsg = true,
1144 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1145 			     [1] = SCA_ACCMODE,  /* mode */ }, },
1146 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1147 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1148 	{ .name	    = "brk",	    .hexret = true,
1149 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1150 	{ .name	    = "chdir",	    .errmsg = true,
1151 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1152 	{ .name	    = "chmod",	    .errmsg = true,
1153 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1154 	{ .name	    = "chroot",	    .errmsg = true,
1155 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1156 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1157 	{ .name	    = "clone",	    .errpid = true, },
1158 	{ .name	    = "close",	    .errmsg = true,
1159 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1160 	{ .name	    = "connect",    .errmsg = true, },
1161 	{ .name	    = "creat",	    .errmsg = true,
1162 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1163 	{ .name	    = "dup",	    .errmsg = true,
1164 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1165 	{ .name	    = "dup2",	    .errmsg = true,
1166 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1167 	{ .name	    = "dup3",	    .errmsg = true,
1168 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1169 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1170 	{ .name	    = "eventfd2",   .errmsg = true,
1171 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1172 	{ .name	    = "faccessat",  .errmsg = true,
1173 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1174 			     [1] = SCA_FILENAME, /* filename */ }, },
1175 	{ .name	    = "fadvise64",  .errmsg = true,
1176 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1177 	{ .name	    = "fallocate",  .errmsg = true,
1178 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1179 	{ .name	    = "fchdir",	    .errmsg = true,
1180 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1181 	{ .name	    = "fchmod",	    .errmsg = true,
1182 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1183 	{ .name	    = "fchmodat",   .errmsg = true,
1184 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1185 			     [1] = SCA_FILENAME, /* filename */ }, },
1186 	{ .name	    = "fchown",	    .errmsg = true,
1187 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1188 	{ .name	    = "fchownat",   .errmsg = true,
1189 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1190 			     [1] = SCA_FILENAME, /* filename */ }, },
1191 	{ .name	    = "fcntl",	    .errmsg = true,
1192 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1193 			     [1] = SCA_STRARRAY, /* cmd */ },
1194 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1195 	{ .name	    = "fdatasync",  .errmsg = true,
1196 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1197 	{ .name	    = "flock",	    .errmsg = true,
1198 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1199 			     [1] = SCA_FLOCK, /* cmd */ }, },
1200 	{ .name	    = "fsetxattr",  .errmsg = true,
1201 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1202 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1203 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1205 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1206 			     [1] = SCA_FILENAME, /* filename */ }, },
1207 	{ .name	    = "fstatfs",    .errmsg = true,
1208 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1209 	{ .name	    = "fsync",    .errmsg = true,
1210 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1211 	{ .name	    = "ftruncate", .errmsg = true,
1212 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1213 	{ .name	    = "futex",	    .errmsg = true,
1214 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1215 	{ .name	    = "futimesat", .errmsg = true,
1216 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1217 			     [1] = SCA_FILENAME, /* filename */ }, },
1218 	{ .name	    = "getdents",   .errmsg = true,
1219 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1220 	{ .name	    = "getdents64", .errmsg = true,
1221 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1222 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1223 	{ .name	    = "getpid",	    .errpid = true, },
1224 	{ .name	    = "getpgid",    .errpid = true, },
1225 	{ .name	    = "getppid",    .errpid = true, },
1226 	{ .name	    = "getrandom",  .errmsg = true,
1227 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
1228 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1229 	{ .name	    = "getxattr",    .errmsg = true,
1230 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1231 	{ .name	    = "inotify_add_watch",	    .errmsg = true,
1232 	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1233 	{ .name	    = "ioctl",	    .errmsg = true,
1234 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1235 #if defined(__i386__) || defined(__x86_64__)
1236 /*
1237  * FIXME: Make this available to all arches.
1238  */
1239 			     [1] = SCA_STRHEXARRAY, /* cmd */
1240 			     [2] = SCA_HEX, /* arg */ },
1241 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1242 #else
1243 			     [2] = SCA_HEX, /* arg */ }, },
1244 #endif
1245 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
1246 	{ .name	    = "kill",	    .errmsg = true,
1247 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1248 	{ .name	    = "lchown",    .errmsg = true,
1249 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1250 	{ .name	    = "lgetxattr",  .errmsg = true,
1251 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1252 	{ .name	    = "linkat",	    .errmsg = true,
1253 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1254 	{ .name	    = "listxattr",  .errmsg = true,
1255 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1256 	{ .name	    = "llistxattr", .errmsg = true,
1257 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1258 	{ .name	    = "lremovexattr",  .errmsg = true,
1259 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1260 	{ .name	    = "lseek",	    .errmsg = true,
1261 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1262 			     [2] = SCA_STRARRAY, /* whence */ },
1263 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1264 	{ .name	    = "lsetxattr",  .errmsg = true,
1265 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1266 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat",
1267 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1268 	{ .name	    = "lsxattr",    .errmsg = true,
1269 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1270 	{ .name     = "madvise",    .errmsg = true,
1271 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1272 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1273 	{ .name	    = "mkdir",    .errmsg = true,
1274 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1275 	{ .name	    = "mkdirat",    .errmsg = true,
1276 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1277 			     [1] = SCA_FILENAME, /* pathname */ }, },
1278 	{ .name	    = "mknod",      .errmsg = true,
1279 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1280 	{ .name	    = "mknodat",    .errmsg = true,
1281 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1282 			     [1] = SCA_FILENAME, /* filename */ }, },
1283 	{ .name	    = "mlock",	    .errmsg = true,
1284 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1285 	{ .name	    = "mlockall",   .errmsg = true,
1286 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1287 	{ .name	    = "mmap",	    .hexret = true,
1288 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1289 			     [2] = SCA_MMAP_PROT, /* prot */
1290 			     [3] = SCA_MMAP_FLAGS, /* flags */
1291 			     [4] = SCA_FD, 	  /* fd */ }, },
1292 	{ .name	    = "mprotect",   .errmsg = true,
1293 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1294 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1295 	{ .name	    = "mq_unlink", .errmsg = true,
1296 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1297 	{ .name	    = "mremap",	    .hexret = true,
1298 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1299 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1300 			     [4] = SCA_HEX, /* new_addr */ }, },
1301 	{ .name	    = "munlock",    .errmsg = true,
1302 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1303 	{ .name	    = "munmap",	    .errmsg = true,
1304 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1305 	{ .name	    = "name_to_handle_at", .errmsg = true,
1306 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1307 	{ .name	    = "newfstatat", .errmsg = true,
1308 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1309 			     [1] = SCA_FILENAME, /* filename */ }, },
1310 	{ .name	    = "open",	    .errmsg = true,
1311 	  .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1312 			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1313 	{ .name	    = "open_by_handle_at", .errmsg = true,
1314 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1315 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1316 	{ .name	    = "openat",	    .errmsg = true,
1317 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1318 			     [1] = SCA_FILENAME, /* filename */
1319 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1320 	{ .name	    = "perf_event_open", .errmsg = true,
1321 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1322 			     [2] = SCA_INT, /* cpu */
1323 			     [3] = SCA_FD,  /* group_fd */
1324 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1325 	{ .name	    = "pipe2",	    .errmsg = true,
1326 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1327 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1328 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1329 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1330 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1331 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1332 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1333 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1334 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1335 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1336 	{ .name	    = "pwritev",    .errmsg = true,
1337 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1338 	{ .name	    = "read",	    .errmsg = true,
1339 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1340 	{ .name	    = "readlink",   .errmsg = true,
1341 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1342 	{ .name	    = "readlinkat", .errmsg = true,
1343 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1344 			     [1] = SCA_FILENAME, /* pathname */ }, },
1345 	{ .name	    = "readv",	    .errmsg = true,
1346 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1347 	{ .name	    = "recvfrom",   .errmsg = true,
1348 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1349 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1350 	{ .name	    = "recvmmsg",   .errmsg = true,
1351 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1352 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1353 	{ .name	    = "recvmsg",    .errmsg = true,
1354 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1355 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1356 	{ .name	    = "removexattr", .errmsg = true,
1357 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1358 	{ .name	    = "renameat",   .errmsg = true,
1359 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1360 	{ .name	    = "rmdir",    .errmsg = true,
1361 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1362 	{ .name	    = "rt_sigaction", .errmsg = true,
1363 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1364 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1365 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1366 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1367 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1368 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1369 	{ .name	    = "sched_setscheduler",   .errmsg = true,
1370 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
1371 	{ .name	    = "seccomp", .errmsg = true,
1372 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
1373 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
1374 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1375 	{ .name	    = "sendmmsg",    .errmsg = true,
1376 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1377 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1378 	{ .name	    = "sendmsg",    .errmsg = true,
1379 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1380 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1381 	{ .name	    = "sendto",	    .errmsg = true,
1382 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1383 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1384 	{ .name	    = "set_tid_address", .errpid = true, },
1385 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1386 	{ .name	    = "setpgid",    .errmsg = true, },
1387 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1388 	{ .name	    = "setxattr",   .errmsg = true,
1389 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1390 	{ .name	    = "shutdown",   .errmsg = true,
1391 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1392 	{ .name	    = "socket",	    .errmsg = true,
1393 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1394 			     [1] = SCA_SK_TYPE, /* type */ },
1395 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1396 	{ .name	    = "socketpair", .errmsg = true,
1397 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1398 			     [1] = SCA_SK_TYPE, /* type */ },
1399 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1400 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat",
1401 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1402 	{ .name	    = "statfs",	    .errmsg = true,
1403 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1404 	{ .name	    = "swapoff",    .errmsg = true,
1405 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1406 	{ .name	    = "swapon",	    .errmsg = true,
1407 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1408 	{ .name	    = "symlinkat",  .errmsg = true,
1409 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1410 	{ .name	    = "tgkill",	    .errmsg = true,
1411 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1412 	{ .name	    = "tkill",	    .errmsg = true,
1413 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1414 	{ .name	    = "truncate",   .errmsg = true,
1415 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1416 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1417 	{ .name	    = "unlinkat",   .errmsg = true,
1418 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1419 			     [1] = SCA_FILENAME, /* pathname */ }, },
1420 	{ .name	    = "utime",  .errmsg = true,
1421 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1422 	{ .name	    = "utimensat",  .errmsg = true,
1423 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1424 			     [1] = SCA_FILENAME, /* filename */ }, },
1425 	{ .name	    = "utimes",  .errmsg = true,
1426 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1427 	{ .name	    = "vmsplice",  .errmsg = true,
1428 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1429 	{ .name	    = "wait4",	    .errpid = true,
1430 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
1431 	{ .name	    = "waitid",	    .errpid = true,
1432 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
1433 	{ .name	    = "write",	    .errmsg = true,
1434 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1435 	{ .name	    = "writev",	    .errmsg = true,
1436 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1437 };
1438 
1439 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1440 {
1441 	const struct syscall_fmt *fmt = fmtp;
1442 	return strcmp(name, fmt->name);
1443 }
1444 
1445 static struct syscall_fmt *syscall_fmt__find(const char *name)
1446 {
1447 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1448 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1449 }
1450 
/*
 * Per-syscall information, filled in by trace__read_syscall_info().
 */
struct syscall {
	/* format of the syscalls:sys_enter_<name> (or <alias>) tracepoint */
	struct event_format *tp_format;
	/* number of argument fields, not counting a leading syscall-number field */
	int		    nr_args;
	/* first argument field; the rest are reached through ->next */
	struct format_field *args;
	/* syscall name, as returned by syscalltbl__name() */
	const char	    *name;
	/* true for "exit" and "exit_group" */
	bool		    is_exit;
	/* matching syscall_fmts[] entry, NULL when there is none */
	struct syscall_fmt  *fmt;
	/* per-argument formatters, nr_args slots, allocated by syscall__set_arg_fmts() */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per-argument formatter parameters; points at fmt->arg_parm when fmt is set */
	void		    **arg_parm;
};
1461 
1462 static size_t fprintf_duration(unsigned long t, FILE *fp)
1463 {
1464 	double duration = (double)t / NSEC_PER_MSEC;
1465 	size_t printed = fprintf(fp, "(");
1466 
1467 	if (duration >= 1.0)
1468 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1469 	else if (duration >= 0.01)
1470 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1471 	else
1472 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1473 	return printed + fprintf(fp, "): ");
1474 }
1475 
/**
 * Per-thread tracing state, stored as the thread's private data.
 *
 * nr_events: bumped every time thread__trace() hands out this state.
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * paths.table: fd-indexed array of strdup()'ed pathnames, NULL for unknown fds.
 * paths.max: highest valid index in paths.table, -1 while the table is empty.
 * syscall_stats: intlist created in thread_trace__new().
 */
struct thread_trace {
	u64		  entry_time;
	u64		  exit_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
1503 
1504 static struct thread_trace *thread_trace__new(void)
1505 {
1506 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1507 
1508 	if (ttrace)
1509 		ttrace->paths.max = -1;
1510 
1511 	ttrace->syscall_stats = intlist__new(NULL);
1512 
1513 	return ttrace;
1514 }
1515 
1516 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1517 {
1518 	struct thread_trace *ttrace;
1519 
1520 	if (thread == NULL)
1521 		goto fail;
1522 
1523 	if (thread__priv(thread) == NULL)
1524 		thread__set_priv(thread, thread_trace__new());
1525 
1526 	if (thread__priv(thread) == NULL)
1527 		goto fail;
1528 
1529 	ttrace = thread__priv(thread);
1530 	++ttrace->nr_events;
1531 
1532 	return ttrace;
1533 fail:
1534 	color_fprintf(fp, PERF_COLOR_RED,
1535 		      "WARNING: not enough memory, dropping samples!\n");
1536 	return NULL;
1537 }
1538 
/*
 * Bit flags. NOTE(review): presumably they select major/minor page fault
 * tracing; the users are outside this section - confirm.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/*
 * NOTE(review): presumably the capacity of thread_trace::entry_str - confirm
 * at the allocation site.
 */
static const size_t trace__entry_str_size = 2048;
1543 
1544 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1545 {
1546 	struct thread_trace *ttrace = thread__priv(thread);
1547 
1548 	if (fd > ttrace->paths.max) {
1549 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1550 
1551 		if (npath == NULL)
1552 			return -1;
1553 
1554 		if (ttrace->paths.max != -1) {
1555 			memset(npath + ttrace->paths.max + 1, 0,
1556 			       (fd - ttrace->paths.max) * sizeof(char *));
1557 		} else {
1558 			memset(npath, 0, (fd + 1) * sizeof(char *));
1559 		}
1560 
1561 		ttrace->paths.table = npath;
1562 		ttrace->paths.max   = fd;
1563 	}
1564 
1565 	ttrace->paths.table[fd] = strdup(pathname);
1566 
1567 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1568 }
1569 
1570 static int thread__read_fd_path(struct thread *thread, int fd)
1571 {
1572 	char linkname[PATH_MAX], pathname[PATH_MAX];
1573 	struct stat st;
1574 	int ret;
1575 
1576 	if (thread->pid_ == thread->tid) {
1577 		scnprintf(linkname, sizeof(linkname),
1578 			  "/proc/%d/fd/%d", thread->pid_, fd);
1579 	} else {
1580 		scnprintf(linkname, sizeof(linkname),
1581 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1582 	}
1583 
1584 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1585 		return -1;
1586 
1587 	ret = readlink(linkname, pathname, sizeof(pathname));
1588 
1589 	if (ret < 0 || ret > st.st_size)
1590 		return -1;
1591 
1592 	pathname[ret] = '\0';
1593 	return trace__set_fd_pathname(thread, fd, pathname);
1594 }
1595 
1596 static const char *thread__fd_path(struct thread *thread, int fd,
1597 				   struct trace *trace)
1598 {
1599 	struct thread_trace *ttrace = thread__priv(thread);
1600 
1601 	if (ttrace == NULL)
1602 		return NULL;
1603 
1604 	if (fd < 0)
1605 		return NULL;
1606 
1607 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1608 		if (!trace->live)
1609 			return NULL;
1610 		++trace->stats.proc_getname;
1611 		if (thread__read_fd_path(thread, fd))
1612 			return NULL;
1613 	}
1614 
1615 	return ttrace->paths.table[fd];
1616 }
1617 
1618 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1619 					struct syscall_arg *arg)
1620 {
1621 	int fd = arg->val;
1622 	size_t printed = scnprintf(bf, size, "%d", fd);
1623 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1624 
1625 	if (path)
1626 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1627 
1628 	return printed;
1629 }
1630 
1631 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1632 					      struct syscall_arg *arg)
1633 {
1634 	int fd = arg->val;
1635 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1636 	struct thread_trace *ttrace = thread__priv(arg->thread);
1637 
1638 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1639 		zfree(&ttrace->paths.table[fd]);
1640 
1641 	return printed;
1642 }
1643 
1644 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1645 				     unsigned long ptr)
1646 {
1647 	struct thread_trace *ttrace = thread__priv(thread);
1648 
1649 	ttrace->filename.ptr = ptr;
1650 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1651 }
1652 
1653 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1654 					      struct syscall_arg *arg)
1655 {
1656 	unsigned long ptr = arg->val;
1657 
1658 	if (!arg->trace->vfs_getname)
1659 		return scnprintf(bf, size, "%#x", ptr);
1660 
1661 	thread__set_filename_pos(arg->thread, bf, ptr);
1662 	return 0;
1663 }
1664 
1665 static bool trace__filter_duration(struct trace *trace, double t)
1666 {
1667 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1668 }
1669 
1670 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1671 {
1672 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1673 
1674 	return fprintf(fp, "%10.3f ", ts);
1675 }
1676 
/*
 * Flags written from the signal handler. volatile sig_atomic_t is the object
 * type that may portably be written from an asynchronous signal handler.
 * Both flags hold only 0 or 1, so existing boolean tests keep working.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: requests termination and records whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1685 
1686 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1687 					u64 duration, u64 tstamp, FILE *fp)
1688 {
1689 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1690 	printed += fprintf_duration(duration, fp);
1691 
1692 	if (trace->multiple_threads) {
1693 		if (trace->show_comm)
1694 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1695 		printed += fprintf(fp, "%d ", thread->tid);
1696 	}
1697 
1698 	return printed;
1699 }
1700 
1701 static int trace__process_event(struct trace *trace, struct machine *machine,
1702 				union perf_event *event, struct perf_sample *sample)
1703 {
1704 	int ret = 0;
1705 
1706 	switch (event->header.type) {
1707 	case PERF_RECORD_LOST:
1708 		color_fprintf(trace->output, PERF_COLOR_RED,
1709 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1710 		ret = machine__process_lost_event(machine, event, sample);
1711 		break;
1712 	default:
1713 		ret = machine__process_event(machine, event, sample);
1714 		break;
1715 	}
1716 
1717 	return ret;
1718 }
1719 
1720 static int trace__tool_process(struct perf_tool *tool,
1721 			       union perf_event *event,
1722 			       struct perf_sample *sample,
1723 			       struct machine *machine)
1724 {
1725 	struct trace *trace = container_of(tool, struct trace, tool);
1726 	return trace__process_event(trace, machine, event, sample);
1727 }
1728 
1729 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1730 {
1731 	int err = symbol__init(NULL);
1732 
1733 	if (err)
1734 		return err;
1735 
1736 	trace->host = machine__new_host();
1737 	if (trace->host == NULL)
1738 		return -ENOMEM;
1739 
1740 	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1741 		return -errno;
1742 
1743 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1744 					    evlist->threads, trace__tool_process, false,
1745 					    trace->opts.proc_map_timeout);
1746 	if (err)
1747 		symbol__exit();
1748 
1749 	return err;
1750 }
1751 
1752 static int syscall__set_arg_fmts(struct syscall *sc)
1753 {
1754 	struct format_field *field;
1755 	int idx = 0;
1756 
1757 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1758 	if (sc->arg_scnprintf == NULL)
1759 		return -1;
1760 
1761 	if (sc->fmt)
1762 		sc->arg_parm = sc->fmt->arg_parm;
1763 
1764 	for (field = sc->args; field; field = field->next) {
1765 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1766 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1767 		else if (field->flags & FIELD_IS_POINTER)
1768 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1769 		else if (strcmp(field->type, "pid_t") == 0)
1770 			sc->arg_scnprintf[idx] = SCA_PID;
1771 		else if (strcmp(field->type, "umode_t") == 0)
1772 			sc->arg_scnprintf[idx] = SCA_MODE_T;
1773 		++idx;
1774 	}
1775 
1776 	return 0;
1777 }
1778 
1779 static int trace__read_syscall_info(struct trace *trace, int id)
1780 {
1781 	char tp_name[128];
1782 	struct syscall *sc;
1783 	const char *name = syscalltbl__name(trace->sctbl, id);
1784 
1785 	if (name == NULL)
1786 		return -1;
1787 
1788 	if (id > trace->syscalls.max) {
1789 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1790 
1791 		if (nsyscalls == NULL)
1792 			return -1;
1793 
1794 		if (trace->syscalls.max != -1) {
1795 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1796 			       (id - trace->syscalls.max) * sizeof(*sc));
1797 		} else {
1798 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1799 		}
1800 
1801 		trace->syscalls.table = nsyscalls;
1802 		trace->syscalls.max   = id;
1803 	}
1804 
1805 	sc = trace->syscalls.table + id;
1806 	sc->name = name;
1807 
1808 	sc->fmt  = syscall_fmt__find(sc->name);
1809 
1810 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1811 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1812 
1813 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1814 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1815 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1816 	}
1817 
1818 	if (IS_ERR(sc->tp_format))
1819 		return -1;
1820 
1821 	sc->args = sc->tp_format->format.fields;
1822 	sc->nr_args = sc->tp_format->format.nr_fields;
1823 	/*
1824 	 * We need to check and discard the first variable '__syscall_nr'
1825 	 * or 'nr' that mean the syscall number. It is needless here.
1826 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1827 	 */
1828 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1829 		sc->args = sc->args->next;
1830 		--sc->nr_args;
1831 	}
1832 
1833 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1834 
1835 	return syscall__set_arg_fmts(sc);
1836 }
1837 
1838 static int trace__validate_ev_qualifier(struct trace *trace)
1839 {
1840 	int err = 0, i;
1841 	struct str_node *pos;
1842 
1843 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1844 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1845 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1846 
1847 	if (trace->ev_qualifier_ids.entries == NULL) {
1848 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1849 		       trace->output);
1850 		err = -EINVAL;
1851 		goto out;
1852 	}
1853 
1854 	i = 0;
1855 
1856 	strlist__for_each(pos, trace->ev_qualifier) {
1857 		const char *sc = pos->s;
1858 		int id = syscalltbl__id(trace->sctbl, sc);
1859 
1860 		if (id < 0) {
1861 			if (err == 0) {
1862 				fputs("Error:\tInvalid syscall ", trace->output);
1863 				err = -EINVAL;
1864 			} else {
1865 				fputs(", ", trace->output);
1866 			}
1867 
1868 			fputs(sc, trace->output);
1869 		}
1870 
1871 		trace->ev_qualifier_ids.entries[i++] = id;
1872 	}
1873 
1874 	if (err < 0) {
1875 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1876 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1877 		zfree(&trace->ev_qualifier_ids.entries);
1878 		trace->ev_qualifier_ids.nr = 0;
1879 	}
1880 out:
1881 	return err;
1882 }
1883 
1884 /*
1885  * args is to be interpreted as a series of longs but we need to handle
1886  * 8-byte unaligned accesses. args points to raw_data within the event
1887  * and raw_data is guaranteed to be 8-byte unaligned because it is
1888  * preceded by raw_size which is a u32. So we need to copy args to a temp
1889  * variable to read it. Most notably this avoids extended load instructions
1890  * on unaligned addresses
1891  */
1892 
1893 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1894 				      unsigned char *args, struct trace *trace,
1895 				      struct thread *thread)
1896 {
1897 	size_t printed = 0;
1898 	unsigned char *p;
1899 	unsigned long val;
1900 
1901 	if (sc->args != NULL) {
1902 		struct format_field *field;
1903 		u8 bit = 1;
1904 		struct syscall_arg arg = {
1905 			.idx	= 0,
1906 			.mask	= 0,
1907 			.trace  = trace,
1908 			.thread = thread,
1909 		};
1910 
1911 		for (field = sc->args; field;
1912 		     field = field->next, ++arg.idx, bit <<= 1) {
1913 			if (arg.mask & bit)
1914 				continue;
1915 
1916 			/* special care for unaligned accesses */
1917 			p = args + sizeof(unsigned long) * arg.idx;
1918 			memcpy(&val, p, sizeof(val));
1919 
1920 			/*
1921  			 * Suppress this argument if its value is zero and
1922  			 * and we don't have a string associated in an
1923  			 * strarray for it.
1924  			 */
1925 			if (val == 0 &&
1926 			    !(sc->arg_scnprintf &&
1927 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1928 			      sc->arg_parm[arg.idx]))
1929 				continue;
1930 
1931 			printed += scnprintf(bf + printed, size - printed,
1932 					     "%s%s: ", printed ? ", " : "", field->name);
1933 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1934 				arg.val = val;
1935 				if (sc->arg_parm)
1936 					arg.parm = sc->arg_parm[arg.idx];
1937 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1938 								      size - printed, &arg);
1939 			} else {
1940 				printed += scnprintf(bf + printed, size - printed,
1941 						     "%ld", val);
1942 			}
1943 		}
1944 	} else {
1945 		int i = 0;
1946 
1947 		while (i < 6) {
1948 			/* special care for unaligned accesses */
1949 			p = args + sizeof(unsigned long) * i;
1950 			memcpy(&val, p, sizeof(val));
1951 			printed += scnprintf(bf + printed, size - printed,
1952 					     "%sarg%d: %ld",
1953 					     printed ? ", " : "", i, val);
1954 			++i;
1955 		}
1956 	}
1957 
1958 	return printed;
1959 }
1960 
/*
 * Signature shared by the per-event handlers in this file; a pointer of this
 * type is stored in evsel->handler and invoked for each sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1964 
1965 static struct syscall *trace__syscall_info(struct trace *trace,
1966 					   struct perf_evsel *evsel, int id)
1967 {
1968 
1969 	if (id < 0) {
1970 
1971 		/*
1972 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1973 		 * before that, leaving at a higher verbosity level till that is
1974 		 * explained. Reproduced with plain ftrace with:
1975 		 *
1976 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1977 		 * grep "NR -1 " /t/trace_pipe
1978 		 *
1979 		 * After generating some load on the machine.
1980  		 */
1981 		if (verbose > 1) {
1982 			static u64 n;
1983 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1984 				id, perf_evsel__name(evsel), ++n);
1985 		}
1986 		return NULL;
1987 	}
1988 
1989 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1990 	    trace__read_syscall_info(trace, id))
1991 		goto out_cant_read;
1992 
1993 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1994 		goto out_cant_read;
1995 
1996 	return &trace->syscalls.table[id];
1997 
1998 out_cant_read:
1999 	if (verbose) {
2000 		fprintf(trace->output, "Problems reading syscall %d", id);
2001 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
2002 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
2003 		fputs(" information\n", trace->output);
2004 	}
2005 	return NULL;
2006 }
2007 
2008 static void thread__update_stats(struct thread_trace *ttrace,
2009 				 int id, struct perf_sample *sample)
2010 {
2011 	struct int_node *inode;
2012 	struct stats *stats;
2013 	u64 duration = 0;
2014 
2015 	inode = intlist__findnew(ttrace->syscall_stats, id);
2016 	if (inode == NULL)
2017 		return;
2018 
2019 	stats = inode->priv;
2020 	if (stats == NULL) {
2021 		stats = malloc(sizeof(struct stats));
2022 		if (stats == NULL)
2023 			return;
2024 		init_stats(stats);
2025 		inode->priv = stats;
2026 	}
2027 
2028 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
2029 		duration = sample->time - ttrace->entry_time;
2030 
2031 	update_stats(stats, duration);
2032 }
2033 
2034 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
2035 {
2036 	struct thread_trace *ttrace;
2037 	u64 duration;
2038 	size_t printed;
2039 
2040 	if (trace->current == NULL)
2041 		return 0;
2042 
2043 	ttrace = thread__priv(trace->current);
2044 
2045 	if (!ttrace->entry_pending)
2046 		return 0;
2047 
2048 	duration = sample->time - ttrace->entry_time;
2049 
2050 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
2051 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
2052 	ttrace->entry_pending = false;
2053 
2054 	return printed;
2055 }
2056 
/*
 * Handler for the syscall entry tracepoint.
 *
 * Formats "name(args" into the thread's entry_str buffer. For syscalls
 * flagged is_exit the line is printed right away, otherwise it is left
 * pending so that trace__sys_exit() can complete it.
 *
 * Returns 0 on success, -1 when the syscall info or the per-thread state
 * cannot be obtained.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* The per-thread entry buffer is allocated on first use. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush a still-pending entry of the current thread before starting a new one. */
	if (!trace->summary_only)
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No exit event to wait for: print the entry line now. */
		if (!trace->duration_filter && !trace->summary_only) {
			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Remember this thread as the current one, swapping the held reference. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2115 
2116 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2117 			   union perf_event *event __maybe_unused,
2118 			   struct perf_sample *sample)
2119 {
2120 	long ret;
2121 	u64 duration = 0;
2122 	struct thread *thread;
2123 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2124 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2125 	struct thread_trace *ttrace;
2126 
2127 	if (sc == NULL)
2128 		return -1;
2129 
2130 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2131 	ttrace = thread__trace(thread, trace->output);
2132 	if (ttrace == NULL)
2133 		goto out_put;
2134 
2135 	if (trace->summary)
2136 		thread__update_stats(ttrace, id, sample);
2137 
2138 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2139 
2140 	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
2141 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2142 		ttrace->filename.pending_open = false;
2143 		++trace->stats.vfs_getname;
2144 	}
2145 
2146 	ttrace->exit_time = sample->time;
2147 
2148 	if (ttrace->entry_time) {
2149 		duration = sample->time - ttrace->entry_time;
2150 		if (trace__filter_duration(trace, duration))
2151 			goto out;
2152 	} else if (trace->duration_filter)
2153 		goto out;
2154 
2155 	if (trace->summary_only)
2156 		goto out;
2157 
2158 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2159 
2160 	if (ttrace->entry_pending) {
2161 		fprintf(trace->output, "%-70s", ttrace->entry_str);
2162 	} else {
2163 		fprintf(trace->output, " ... [");
2164 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2165 		fprintf(trace->output, "]: %s()", sc->name);
2166 	}
2167 
2168 	if (sc->fmt == NULL) {
2169 signed_print:
2170 		fprintf(trace->output, ") = %ld", ret);
2171 	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
2172 		char bf[STRERR_BUFSIZE];
2173 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2174 			   *e = audit_errno_to_name(-ret);
2175 
2176 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
2177 	} else if (ret == 0 && sc->fmt->timeout)
2178 		fprintf(trace->output, ") = 0 Timeout");
2179 	else if (sc->fmt->hexret)
2180 		fprintf(trace->output, ") = %#lx", ret);
2181 	else if (sc->fmt->errpid) {
2182 		struct thread *child = machine__find_thread(trace->host, ret, ret);
2183 
2184 		if (child != NULL) {
2185 			fprintf(trace->output, ") = %ld", ret);
2186 			if (child->comm_set)
2187 				fprintf(trace->output, " (%s)", thread__comm_str(child));
2188 			thread__put(child);
2189 		}
2190 	} else
2191 		goto signed_print;
2192 
2193 	fputc('\n', trace->output);
2194 
2195 	if (sample->callchain) {
2196 		struct addr_location al;
2197 		/* TODO: user-configurable print_opts */
2198 		const unsigned int print_opts = PRINT_IP_OPT_SYM
2199 					      | PRINT_IP_OPT_DSO;
2200 
2201 		if (machine__resolve(trace->host, &al, sample) < 0) {
2202 			pr_err("problem processing %d event, skipping it.\n",
2203 			       event->header.type);
2204 			goto out_put;
2205 		}
2206 		perf_evsel__fprintf_callchain(evsel, sample, &al, 38, print_opts,
2207 					      scripting_max_stack, trace->output);
2208 	}
2209 out:
2210 	ttrace->entry_pending = false;
2211 	err = 0;
2212 out_put:
2213 	thread__put(thread);
2214 	return err;
2215 }
2216 
2217 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2218 			      union perf_event *event __maybe_unused,
2219 			      struct perf_sample *sample)
2220 {
2221 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2222 	struct thread_trace *ttrace;
2223 	size_t filename_len, entry_str_len, to_move;
2224 	ssize_t remaining_space;
2225 	char *pos;
2226 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2227 
2228 	if (!thread)
2229 		goto out;
2230 
2231 	ttrace = thread__priv(thread);
2232 	if (!ttrace)
2233 		goto out;
2234 
2235 	filename_len = strlen(filename);
2236 
2237 	if (ttrace->filename.namelen < filename_len) {
2238 		char *f = realloc(ttrace->filename.name, filename_len + 1);
2239 
2240 		if (f == NULL)
2241 				goto out;
2242 
2243 		ttrace->filename.namelen = filename_len;
2244 		ttrace->filename.name = f;
2245 	}
2246 
2247 	strcpy(ttrace->filename.name, filename);
2248 	ttrace->filename.pending_open = true;
2249 
2250 	if (!ttrace->filename.ptr)
2251 		goto out;
2252 
2253 	entry_str_len = strlen(ttrace->entry_str);
2254 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2255 	if (remaining_space <= 0)
2256 		goto out;
2257 
2258 	if (filename_len > (size_t)remaining_space) {
2259 		filename += filename_len - remaining_space;
2260 		filename_len = remaining_space;
2261 	}
2262 
2263 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2264 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2265 	memmove(pos + filename_len, pos, to_move);
2266 	memcpy(pos, filename, filename_len);
2267 
2268 	ttrace->filename.ptr = 0;
2269 	ttrace->filename.entry_str_pos = 0;
2270 out:
2271 	return 0;
2272 }
2273 
2274 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2275 				     union perf_event *event __maybe_unused,
2276 				     struct perf_sample *sample)
2277 {
2278         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2279 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2280 	struct thread *thread = machine__findnew_thread(trace->host,
2281 							sample->pid,
2282 							sample->tid);
2283 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2284 
2285 	if (ttrace == NULL)
2286 		goto out_dump;
2287 
2288 	ttrace->runtime_ms += runtime_ms;
2289 	trace->runtime_ms += runtime_ms;
2290 	thread__put(thread);
2291 	return 0;
2292 
2293 out_dump:
2294 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2295 	       evsel->name,
2296 	       perf_evsel__strval(evsel, sample, "comm"),
2297 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2298 	       runtime,
2299 	       perf_evsel__intval(evsel, sample, "vruntime"));
2300 	thread__put(thread);
2301 	return 0;
2302 }
2303 
2304 static void bpf_output__printer(enum binary_printer_ops op,
2305 				unsigned int val, void *extra)
2306 {
2307 	FILE *output = extra;
2308 	unsigned char ch = (unsigned char)val;
2309 
2310 	switch (op) {
2311 	case BINARY_PRINT_CHAR_DATA:
2312 		fprintf(output, "%c", isprint(ch) ? ch : '.');
2313 		break;
2314 	case BINARY_PRINT_DATA_BEGIN:
2315 	case BINARY_PRINT_LINE_BEGIN:
2316 	case BINARY_PRINT_ADDR:
2317 	case BINARY_PRINT_NUM_DATA:
2318 	case BINARY_PRINT_NUM_PAD:
2319 	case BINARY_PRINT_SEP:
2320 	case BINARY_PRINT_CHAR_PAD:
2321 	case BINARY_PRINT_LINE_END:
2322 	case BINARY_PRINT_DATA_END:
2323 	default:
2324 		break;
2325 	}
2326 }
2327 
/*
 * Dump the raw payload of a BPF output sample to trace->output, using
 * print_binary() with bpf_output__printer as the per-item callback.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
2334 
/*
 * Generic handler for non-syscall events: prints a timestamp, the event name
 * and then either the BPF output payload or the formatted tracepoint fields.
 * Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	/* Flush a pending syscall entry first so the output lines don't interleave. */
	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Blank placeholder printed only when syscalls are being traced too. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");
	return 0;
}
2358 
2359 static void print_location(FILE *f, struct perf_sample *sample,
2360 			   struct addr_location *al,
2361 			   bool print_dso, bool print_sym)
2362 {
2363 
2364 	if ((verbose || print_dso) && al->map)
2365 		fprintf(f, "%s@", al->map->dso->long_name);
2366 
2367 	if ((verbose || print_sym) && al->sym)
2368 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2369 			al->addr - al->sym->start);
2370 	else if (al->map)
2371 		fprintf(f, "0x%" PRIx64, al->addr);
2372 	else
2373 		fprintf(f, "0x%" PRIx64, sample->addr);
2374 }
2375 
2376 static int trace__pgfault(struct trace *trace,
2377 			  struct perf_evsel *evsel,
2378 			  union perf_event *event __maybe_unused,
2379 			  struct perf_sample *sample)
2380 {
2381 	struct thread *thread;
2382 	struct addr_location al;
2383 	char map_type = 'd';
2384 	struct thread_trace *ttrace;
2385 	int err = -1;
2386 
2387 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2388 	ttrace = thread__trace(thread, trace->output);
2389 	if (ttrace == NULL)
2390 		goto out_put;
2391 
2392 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2393 		ttrace->pfmaj++;
2394 	else
2395 		ttrace->pfmin++;
2396 
2397 	if (trace->summary_only)
2398 		goto out;
2399 
2400 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2401 			      sample->ip, &al);
2402 
2403 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2404 
2405 	fprintf(trace->output, "%sfault [",
2406 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2407 		"maj" : "min");
2408 
2409 	print_location(trace->output, sample, &al, false, true);
2410 
2411 	fprintf(trace->output, "] => ");
2412 
2413 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2414 				   sample->addr, &al);
2415 
2416 	if (!al.map) {
2417 		thread__find_addr_location(thread, sample->cpumode,
2418 					   MAP__FUNCTION, sample->addr, &al);
2419 
2420 		if (al.map)
2421 			map_type = 'x';
2422 		else
2423 			map_type = '?';
2424 	}
2425 
2426 	print_location(trace->output, sample, &al, true, false);
2427 
2428 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2429 out:
2430 	err = 0;
2431 out_put:
2432 	thread__put(thread);
2433 	return err;
2434 }
2435 
2436 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2437 {
2438 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2439 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2440 		return false;
2441 
2442 	if (trace->pid_list || trace->tid_list)
2443 		return true;
2444 
2445 	return false;
2446 }
2447 
2448 static void trace__set_base_time(struct trace *trace,
2449 				 struct perf_evsel *evsel,
2450 				 struct perf_sample *sample)
2451 {
2452 	/*
2453 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2454 	 * and don't use sample->time unconditionally, we may end up having
2455 	 * some other event in the future without PERF_SAMPLE_TIME for good
2456 	 * reason, i.e. we may not be interested in its timestamps, just in
2457 	 * it taking place, picking some piece of information when it
2458 	 * appears in our event stream (vfs_getname comes to mind).
2459 	 */
2460 	if (trace->base_time == 0 && !trace->full_time &&
2461 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2462 		trace->base_time = sample->time;
2463 }
2464 
2465 static int trace__process_sample(struct perf_tool *tool,
2466 				 union perf_event *event,
2467 				 struct perf_sample *sample,
2468 				 struct perf_evsel *evsel,
2469 				 struct machine *machine __maybe_unused)
2470 {
2471 	struct trace *trace = container_of(tool, struct trace, tool);
2472 	int err = 0;
2473 
2474 	tracepoint_handler handler = evsel->handler;
2475 
2476 	if (skip_sample(trace, sample))
2477 		return 0;
2478 
2479 	trace__set_base_time(trace, evsel, sample);
2480 
2481 	if (handler) {
2482 		++trace->nr_events;
2483 		handler(trace, evsel, event, sample);
2484 	}
2485 
2486 	return err;
2487 }
2488 
2489 static int parse_target_str(struct trace *trace)
2490 {
2491 	if (trace->opts.target.pid) {
2492 		trace->pid_list = intlist__new(trace->opts.target.pid);
2493 		if (trace->pid_list == NULL) {
2494 			pr_err("Error parsing process id string\n");
2495 			return -EINVAL;
2496 		}
2497 	}
2498 
2499 	if (trace->opts.target.tid) {
2500 		trace->tid_list = intlist__new(trace->opts.target.tid);
2501 		if (trace->tid_list == NULL) {
2502 			pr_err("Error parsing thread id string\n");
2503 			return -EINVAL;
2504 		}
2505 	}
2506 
2507 	return 0;
2508 }
2509 
2510 static int trace__record(struct trace *trace, int argc, const char **argv)
2511 {
2512 	unsigned int rec_argc, i, j;
2513 	const char **rec_argv;
2514 	const char * const record_args[] = {
2515 		"record",
2516 		"-R",
2517 		"-m", "1024",
2518 		"-c", "1",
2519 	};
2520 
2521 	const char * const sc_args[] = { "-e", };
2522 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2523 	const char * const majpf_args[] = { "-e", "major-faults" };
2524 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2525 	const char * const minpf_args[] = { "-e", "minor-faults" };
2526 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2527 
2528 	/* +1 is for the event string below */
2529 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2530 		majpf_args_nr + minpf_args_nr + argc;
2531 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2532 
2533 	if (rec_argv == NULL)
2534 		return -ENOMEM;
2535 
2536 	j = 0;
2537 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2538 		rec_argv[j++] = record_args[i];
2539 
2540 	if (trace->trace_syscalls) {
2541 		for (i = 0; i < sc_args_nr; i++)
2542 			rec_argv[j++] = sc_args[i];
2543 
2544 		/* event string may be different for older kernels - e.g., RHEL6 */
2545 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2546 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2547 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2548 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2549 		else {
2550 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2551 			return -1;
2552 		}
2553 	}
2554 
2555 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2556 		for (i = 0; i < majpf_args_nr; i++)
2557 			rec_argv[j++] = majpf_args[i];
2558 
2559 	if (trace->trace_pgfaults & TRACE_PFMIN)
2560 		for (i = 0; i < minpf_args_nr; i++)
2561 			rec_argv[j++] = minpf_args[i];
2562 
2563 	for (i = 0; i < (unsigned int)argc; i++)
2564 		rec_argv[j++] = argv[i];
2565 
2566 	return cmd_record(j, rec_argv, NULL);
2567 }
2568 
2569 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2570 
2571 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2572 {
2573 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2574 
2575 	if (IS_ERR(evsel))
2576 		return false;
2577 
2578 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2579 		perf_evsel__delete(evsel);
2580 		return false;
2581 	}
2582 
2583 	evsel->handler = trace__vfs_getname;
2584 	perf_evlist__add(evlist, evsel);
2585 	return true;
2586 }
2587 
/*
 * Add a software page fault event (selected by @config) to @evlist with
 * trace__pgfault as its handler.
 *
 * Returns 0 on success or -ENOMEM if the evsel cannot be created.
 */
static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
				    u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	/* A period of 1 asks for a sample on every fault. */
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (!evsel)
		return -ENOMEM;

	evsel->handler = trace__pgfault;
	perf_evlist__add(evlist, evsel);

	return 0;
}
2611 
2612 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2613 {
2614 	const u32 type = event->header.type;
2615 	struct perf_evsel *evsel;
2616 
2617 	if (type != PERF_RECORD_SAMPLE) {
2618 		trace__process_event(trace, trace->host, event, sample);
2619 		return;
2620 	}
2621 
2622 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2623 	if (evsel == NULL) {
2624 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2625 		return;
2626 	}
2627 
2628 	trace__set_base_time(trace, evsel, sample);
2629 
2630 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2631 	    sample->raw_data == NULL) {
2632 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2633 		       perf_evsel__name(evsel), sample->tid,
2634 		       sample->cpu, sample->raw_size);
2635 	} else {
2636 		tracepoint_handler handler = evsel->handler;
2637 		handler(trace, evsel, event, sample);
2638 	}
2639 }
2640 
2641 static int trace__add_syscall_newtp(struct trace *trace)
2642 {
2643 	int ret = -1;
2644 	struct perf_evlist *evlist = trace->evlist;
2645 	struct perf_evsel *sys_enter, *sys_exit;
2646 
2647 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2648 	if (sys_enter == NULL)
2649 		goto out;
2650 
2651 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2652 		goto out_delete_sys_enter;
2653 
2654 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2655 	if (sys_exit == NULL)
2656 		goto out_delete_sys_enter;
2657 
2658 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2659 		goto out_delete_sys_exit;
2660 
2661 	perf_evlist__add(evlist, sys_enter);
2662 	perf_evlist__add(evlist, sys_exit);
2663 
2664 	trace->syscalls.events.sys_enter = sys_enter;
2665 	trace->syscalls.events.sys_exit  = sys_exit;
2666 
2667 	ret = 0;
2668 out:
2669 	return ret;
2670 
2671 out_delete_sys_exit:
2672 	perf_evsel__delete_priv(sys_exit);
2673 out_delete_sys_enter:
2674 	perf_evsel__delete_priv(sys_enter);
2675 	goto out;
2676 }
2677 
2678 static int trace__set_ev_qualifier_filter(struct trace *trace)
2679 {
2680 	int err = -1;
2681 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2682 						trace->ev_qualifier_ids.nr,
2683 						trace->ev_qualifier_ids.entries);
2684 
2685 	if (filter == NULL)
2686 		goto out_enomem;
2687 
2688 	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2689 		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2690 
2691 	free(filter);
2692 out:
2693 	return err;
2694 out_enomem:
2695 	errno = ENOMEM;
2696 	goto out;
2697 }
2698 
/*
 * Live tracing: set up the requested events (syscalls, vfs_getname, page
 * faults, sched_stat_runtime), optionally fork the workload given in @argv,
 * open/filter/mmap the events and then loop reading the ring buffers until
 * interrupted or until the buffers are drained after 'done' is set.
 *
 * Returns 0 on success or a negative/non-zero error after printing a
 * diagnostic to trace->output. The evlist is always deleted on return.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
		goto out_error_mem;
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
		goto out_error_mem;

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure evsel is used below to name the offending event. */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	/* Snapshot the event count to detect whether this pass read anything. */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Stop producing new events, but keep consuming what is buffered. */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* Nothing read: block in poll, or wait 100 once 'done' is set. */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels that need a message buffer live in their own block so that
 * errbuf is only declared for these paths.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		strerror_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2914 
2915 static int trace__replay(struct trace *trace)
2916 {
2917 	const struct perf_evsel_str_handler handlers[] = {
2918 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2919 	};
2920 	struct perf_data_file file = {
2921 		.path  = input_name,
2922 		.mode  = PERF_DATA_MODE_READ,
2923 		.force = trace->force,
2924 	};
2925 	struct perf_session *session;
2926 	struct perf_evsel *evsel;
2927 	int err = -1;
2928 
2929 	trace->tool.sample	  = trace__process_sample;
2930 	trace->tool.mmap	  = perf_event__process_mmap;
2931 	trace->tool.mmap2	  = perf_event__process_mmap2;
2932 	trace->tool.comm	  = perf_event__process_comm;
2933 	trace->tool.exit	  = perf_event__process_exit;
2934 	trace->tool.fork	  = perf_event__process_fork;
2935 	trace->tool.attr	  = perf_event__process_attr;
2936 	trace->tool.tracing_data = perf_event__process_tracing_data;
2937 	trace->tool.build_id	  = perf_event__process_build_id;
2938 
2939 	trace->tool.ordered_events = true;
2940 	trace->tool.ordering_requires_timestamps = true;
2941 
2942 	/* add tid to output */
2943 	trace->multiple_threads = true;
2944 
2945 	session = perf_session__new(&file, false, &trace->tool);
2946 	if (session == NULL)
2947 		return -1;
2948 
2949 	if (symbol__init(&session->header.env) < 0)
2950 		goto out;
2951 
2952 	trace->host = &session->machines.host;
2953 
2954 	err = perf_session__set_tracepoints_handlers(session, handlers);
2955 	if (err)
2956 		goto out;
2957 
2958 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2959 						     "raw_syscalls:sys_enter");
2960 	/* older kernels have syscalls tp versus raw_syscalls */
2961 	if (evsel == NULL)
2962 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2963 							     "syscalls:sys_enter");
2964 
2965 	if (evsel &&
2966 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2967 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2968 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2969 		goto out;
2970 	}
2971 
2972 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2973 						     "raw_syscalls:sys_exit");
2974 	if (evsel == NULL)
2975 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2976 							     "syscalls:sys_exit");
2977 	if (evsel &&
2978 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2979 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2980 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2981 		goto out;
2982 	}
2983 
2984 	evlist__for_each(session->evlist, evsel) {
2985 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2986 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2987 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2988 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2989 			evsel->handler = trace__pgfault;
2990 	}
2991 
2992 	err = parse_target_str(trace);
2993 	if (err != 0)
2994 		goto out;
2995 
2996 	setup_pager();
2997 
2998 	err = perf_session__process_events(session);
2999 	if (err)
3000 		pr_err("Failed to process events, error %d", err);
3001 
3002 	else if (trace->summary)
3003 		trace__fprintf_thread_summary(trace, trace->output);
3004 
3005 out:
3006 	perf_session__delete(session);
3007 
3008 	return err;
3009 }
3010 
/*
 * Write the heading that precedes the per-thread summary to @fp.
 *
 * Returns what fprintf() reported for the heading, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
3019 
3020 static size_t thread__dump_stats(struct thread_trace *ttrace,
3021 				 struct trace *trace, FILE *fp)
3022 {
3023 	struct stats *stats;
3024 	size_t printed = 0;
3025 	struct syscall *sc;
3026 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
3027 
3028 	if (inode == NULL)
3029 		return 0;
3030 
3031 	printed += fprintf(fp, "\n");
3032 
3033 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
3034 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
3035 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
3036 
3037 	/* each int_node is a syscall */
3038 	while (inode) {
3039 		stats = inode->priv;
3040 		if (stats) {
3041 			double min = (double)(stats->min) / NSEC_PER_MSEC;
3042 			double max = (double)(stats->max) / NSEC_PER_MSEC;
3043 			double avg = avg_stats(stats);
3044 			double pct;
3045 			u64 n = (u64) stats->n;
3046 
3047 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
3048 			avg /= NSEC_PER_MSEC;
3049 
3050 			sc = &trace->syscalls.table[inode->i];
3051 			printed += fprintf(fp, "   %-15s", sc->name);
3052 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
3053 					   n, avg * n, min, avg);
3054 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
3055 		}
3056 
3057 		inode = intlist__next(inode);
3058 	}
3059 
3060 	printed += fprintf(fp, "\n\n");
3061 
3062 	return printed;
3063 }
3064 
/*
 * struct used to pass data to per-thread function
 *
 * Filled in by trace__fprintf_thread_summary() and handed, as the opaque
 * priv pointer, to trace__fprintf_one_thread() for every thread.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* trace session the threads belong to */
	size_t printed;		/* running total of the fprintf() return values */
};
3071 
3072 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
3073 {
3074 	struct summary_data *data = priv;
3075 	FILE *fp = data->fp;
3076 	size_t printed = data->printed;
3077 	struct trace *trace = data->trace;
3078 	struct thread_trace *ttrace = thread__priv(thread);
3079 	double ratio;
3080 
3081 	if (ttrace == NULL)
3082 		return 0;
3083 
3084 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
3085 
3086 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
3087 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
3088 	printed += fprintf(fp, "%.1f%%", ratio);
3089 	if (ttrace->pfmaj)
3090 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
3091 	if (ttrace->pfmin)
3092 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
3093 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
3094 	printed += thread__dump_stats(ttrace, trace, fp);
3095 
3096 	data->printed += printed;
3097 
3098 	return 0;
3099 }
3100 
3101 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
3102 {
3103 	struct summary_data data = {
3104 		.fp = fp,
3105 		.trace = trace
3106 	};
3107 	data.printed = trace__fprintf_threads_header(fp);
3108 
3109 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
3110 
3111 	return data.printed;
3112 }
3113 
3114 static int trace__set_duration(const struct option *opt, const char *str,
3115 			       int unset __maybe_unused)
3116 {
3117 	struct trace *trace = opt->value;
3118 
3119 	trace->duration_filter = atof(str);
3120 	return 0;
3121 }
3122 
3123 static int trace__set_filter_pids(const struct option *opt, const char *str,
3124 				  int unset __maybe_unused)
3125 {
3126 	int ret = -1;
3127 	size_t i;
3128 	struct trace *trace = opt->value;
3129 	/*
3130 	 * FIXME: introduce a intarray class, plain parse csv and create a
3131 	 * { int nr, int entries[] } struct...
3132 	 */
3133 	struct intlist *list = intlist__new(str);
3134 
3135 	if (list == NULL)
3136 		return -1;
3137 
3138 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3139 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3140 
3141 	if (trace->filter_pids.entries == NULL)
3142 		goto out;
3143 
3144 	trace->filter_pids.entries[0] = getpid();
3145 
3146 	for (i = 1; i < trace->filter_pids.nr; ++i)
3147 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3148 
3149 	intlist__delete(list);
3150 	ret = 0;
3151 out:
3152 	return ret;
3153 }
3154 
3155 static int trace__open_output(struct trace *trace, const char *filename)
3156 {
3157 	struct stat st;
3158 
3159 	if (!stat(filename, &st) && st.st_size) {
3160 		char oldname[PATH_MAX];
3161 
3162 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3163 		unlink(oldname);
3164 		rename(filename, oldname);
3165 	}
3166 
3167 	trace->output = fopen(filename, "w");
3168 
3169 	return trace->output == NULL ? -errno : 0;
3170 }
3171 
3172 static int parse_pagefaults(const struct option *opt, const char *str,
3173 			    int unset __maybe_unused)
3174 {
3175 	int *trace_pgfaults = opt->value;
3176 
3177 	if (strcmp(str, "all") == 0)
3178 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3179 	else if (strcmp(str, "maj") == 0)
3180 		*trace_pgfaults |= TRACE_PFMAJ;
3181 	else if (strcmp(str, "min") == 0)
3182 		*trace_pgfaults |= TRACE_PFMIN;
3183 	else
3184 		return -1;
3185 
3186 	return 0;
3187 }
3188 
3189 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3190 {
3191 	struct perf_evsel *evsel;
3192 
3193 	evlist__for_each(evlist, evsel)
3194 		evsel->handler = handler;
3195 }
3196 
/*
 * Entry point of the 'perf trace' builtin.
 *
 * Sets up the defaults, parses the command line and then does one of:
 *  - hand over to trace__record() when the first non-option argument is
 *    "record",
 *  - replay a file with trace__replay() when input_name is set,
 *  - trace live with trace__run() otherwise.
 *
 * Returns the value of the selected helper, or a non-zero error when the
 * setup fails before one of them is reached.
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/* defaults, may be overridden by the options parsed below */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	/* both are created before option parsing; --event stores into trace.evlist */
	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	/* parsing stops at the first non-option; argc/argv then describe what is left */
	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	/* page fault tracing turns on address and time sampling */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.callgraph_set)
		symbol_conf.use_callchain = true;

	/* events added with --event are all handled by trace__event_handler */
	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/*
	 * 'perf trace record ...': hand the remaining arguments, minus the
	 * "record" word itself, to trace__record().
	 * NOTE(review): this return, like the "Please specify something to
	 * trace" one below, does not go through the out/out_close labels.
	 */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	/* -o: write to the named file instead of the default stderr */
	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		/* a leading '!' sets not_ev_qualifier and is skipped before parsing the list */
		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload on the command line and no target selected: trace system wide */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	/* -i selects replay of a recorded file, otherwise trace live */
	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	/* only close what trace__open_output() opened, never the default stderr */
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3386