xref: /linux/tools/perf/builtin-trace.c (revision 110e6f26af80dfd90b6e5c645b1aed7228aa580d)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 
38 #include <libaudit.h>
39 #include <stdlib.h>
40 #include <sys/mman.h>
41 #include <linux/futex.h>
42 #include <linux/err.h>
43 
/*
 * For older distros: fallback values for constants the system headers may
 * not provide.  Every definition is guarded, so a value supplied by the
 * headers always takes precedence.
 */

/* mmap(2) / madvise(2) */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif
#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100
#endif
#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif
#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd2(2) */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif
#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif
#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

/* open(2) */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* socket(2) / recvmsg(2) */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif
#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif
#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* perf_event_open(2) flags */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif
#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif
#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif
#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
109 
110 
111 struct tp_field {
112 	int offset;
113 	union {
114 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
115 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
116 	};
117 };
118 
119 #define TP_UINT_FIELD(bits) \
120 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
121 { \
122 	u##bits value; \
123 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
124 	return value;  \
125 }
126 
127 TP_UINT_FIELD(8);
128 TP_UINT_FIELD(16);
129 TP_UINT_FIELD(32);
130 TP_UINT_FIELD(64);
131 
132 #define TP_UINT_FIELD__SWAPPED(bits) \
133 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
134 { \
135 	u##bits value; \
136 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137 	return bswap_##bits(value);\
138 }
139 
140 TP_UINT_FIELD__SWAPPED(16);
141 TP_UINT_FIELD__SWAPPED(32);
142 TP_UINT_FIELD__SWAPPED(64);
143 
144 static int tp_field__init_uint(struct tp_field *field,
145 			       struct format_field *format_field,
146 			       bool needs_swap)
147 {
148 	field->offset = format_field->offset;
149 
150 	switch (format_field->size) {
151 	case 1:
152 		field->integer = tp_field__u8;
153 		break;
154 	case 2:
155 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
156 		break;
157 	case 4:
158 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
159 		break;
160 	case 8:
161 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
162 		break;
163 	default:
164 		return -1;
165 	}
166 
167 	return 0;
168 }
169 
170 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
171 {
172 	return sample->raw_data + field->offset;
173 }
174 
175 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
176 {
177 	field->offset = format_field->offset;
178 	field->pointer = tp_field__ptr;
179 	return 0;
180 }
181 
182 struct syscall_tp {
183 	struct tp_field id;
184 	union {
185 		struct tp_field args, ret;
186 	};
187 };
188 
189 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
190 					  struct tp_field *field,
191 					  const char *name)
192 {
193 	struct format_field *format_field = perf_evsel__field(evsel, name);
194 
195 	if (format_field == NULL)
196 		return -1;
197 
198 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
199 }
200 
/*
 * Initialise the integer accessor for member @name of the struct syscall_tp
 * stored in @evsel->priv, looking the field up under that same name.
 * Evaluates to the result of perf_evsel__init_tp_uint_field().
 *
 * @evsel is parenthesized so that any pointer expression can be passed; note
 * that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
204 
/*
 * Look up the tracepoint field called @name in @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	if (!fmt)
		return -1;

	return tp_field__init_ptr(field, fmt);
}
216 
/*
 * Initialise the pointer accessor for member @name of the struct syscall_tp
 * stored in @evsel->priv, looking the field up under that same name.
 * Evaluates to the result of perf_evsel__init_tp_ptr_field().
 *
 * @evsel is parenthesized so that any pointer expression can be passed; note
 * that it is evaluated twice.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
220 
221 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
222 {
223 	zfree(&evsel->priv);
224 	perf_evsel__delete(evsel);
225 }
226 
227 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
228 {
229 	evsel->priv = malloc(sizeof(struct syscall_tp));
230 	if (evsel->priv != NULL) {
231 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
232 			goto out_delete;
233 
234 		evsel->handler = handler;
235 		return 0;
236 	}
237 
238 	return -ENOMEM;
239 
240 out_delete:
241 	zfree(&evsel->priv);
242 	return -ENOENT;
243 }
244 
/*
 * Create the evsel for the syscall tracepoint named @direction and attach
 * @handler to it.
 *
 * "raw_syscalls" is tried first, then "syscalls".  Returns the new evsel, or
 * NULL if neither tracepoint could be created or the syscall_tp setup failed.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel;

	evsel = perf_evsel__newtp("raw_syscalls", direction);
	if (IS_ERR(evsel)) {
		/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
265 
/*
 * Read member @name of the struct syscall_tp in @evsel->priv from @sample,
 * as an integer (u64) through the accessor installed for that member.
 * The macro arguments are parenthesized so arbitrary expressions are safe.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, (sample)); })

/*
 * Same as perf_evsel__sc_tp_uint() but uses the pointer accessor and yields
 * a void * to the member's data.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, (sample)); })
273 
274 struct syscall_arg {
275 	unsigned long val;
276 	struct thread *thread;
277 	struct trace  *trace;
278 	void	      *parm;
279 	u8	      idx;
280 	u8	      mask;
281 };
282 
/*
 * Table mapping consecutive integer values to names.
 *
 * @offset:     value that corresponds to entries[0].
 * @nr_entries: number of slots in @entries.
 * @entries:    the names, indexed by (value - offset).
 */
struct strarray {
	int		offset;
	int		nr_entries;
	const char	**entries;
};
288 
/*
 * Define 'struct strarray strarray__<array>' wrapping the string array
 * @array, whose first entry stands for value 0.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
}

/* Same as DEFINE_STRARRAY(), but the first entry stands for value @off. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.entries    = array, \
	.nr_entries = ARRAY_SIZE(array), \
	.offset	    = off, \
}
299 
300 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301 						const char *intfmt,
302 					        struct syscall_arg *arg)
303 {
304 	struct strarray *sa = arg->parm;
305 	int idx = arg->val - sa->offset;
306 
307 	if (idx < 0 || idx >= sa->nr_entries)
308 		return scnprintf(bf, size, intfmt, arg->val);
309 
310 	return scnprintf(bf, size, "%s", sa->entries[idx]);
311 }
312 
/*
 * strarray formatter that prints values without a name in decimal.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
320 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 *
 * strarray formatter that prints values without a name in hexadecimal.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	static const char fallback_fmt[] = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
334 
/*
 * File descriptor formatter.  Only declared here so the formatters and the
 * table below can reference it; the definition is not in this part of the
 * file.
 */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd
339 
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341 					   struct syscall_arg *arg)
342 {
343 	int fd = arg->val;
344 
345 	if (fd == AT_FDCWD)
346 		return scnprintf(bf, size, "CWD");
347 
348 	return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350 
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352 
/*
 * Formatter used for the fd argument of close(); only declared here, the
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357 
358 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
359 					 struct syscall_arg *arg)
360 {
361 	return scnprintf(bf, size, "%#lx", arg->val);
362 }
363 
364 #define SCA_HEX syscall_arg__scnprintf_hex
365 
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367 					 struct syscall_arg *arg)
368 {
369 	return scnprintf(bf, size, "%d", arg->val);
370 }
371 
372 #define SCA_INT syscall_arg__scnprintf_int
373 
374 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
375 					       struct syscall_arg *arg)
376 {
377 	int printed = 0, prot = arg->val;
378 
379 	if (prot == PROT_NONE)
380 		return scnprintf(bf, size, "NONE");
381 #define	P_MMAP_PROT(n) \
382 	if (prot & PROT_##n) { \
383 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
384 		prot &= ~PROT_##n; \
385 	}
386 
387 	P_MMAP_PROT(EXEC);
388 	P_MMAP_PROT(READ);
389 	P_MMAP_PROT(WRITE);
390 #ifdef PROT_SEM
391 	P_MMAP_PROT(SEM);
392 #endif
393 	P_MMAP_PROT(GROWSDOWN);
394 	P_MMAP_PROT(GROWSUP);
395 #undef P_MMAP_PROT
396 
397 	if (prot)
398 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
399 
400 	return printed;
401 }
402 
403 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
404 
405 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
406 						struct syscall_arg *arg)
407 {
408 	int printed = 0, flags = arg->val;
409 
410 #define	P_MMAP_FLAG(n) \
411 	if (flags & MAP_##n) { \
412 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
413 		flags &= ~MAP_##n; \
414 	}
415 
416 	P_MMAP_FLAG(SHARED);
417 	P_MMAP_FLAG(PRIVATE);
418 #ifdef MAP_32BIT
419 	P_MMAP_FLAG(32BIT);
420 #endif
421 	P_MMAP_FLAG(ANONYMOUS);
422 	P_MMAP_FLAG(DENYWRITE);
423 	P_MMAP_FLAG(EXECUTABLE);
424 	P_MMAP_FLAG(FILE);
425 	P_MMAP_FLAG(FIXED);
426 	P_MMAP_FLAG(GROWSDOWN);
427 #ifdef MAP_HUGETLB
428 	P_MMAP_FLAG(HUGETLB);
429 #endif
430 	P_MMAP_FLAG(LOCKED);
431 	P_MMAP_FLAG(NONBLOCK);
432 	P_MMAP_FLAG(NORESERVE);
433 	P_MMAP_FLAG(POPULATE);
434 	P_MMAP_FLAG(STACK);
435 #ifdef MAP_UNINITIALIZED
436 	P_MMAP_FLAG(UNINITIALIZED);
437 #endif
438 #undef P_MMAP_FLAG
439 
440 	if (flags)
441 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
442 
443 	return printed;
444 }
445 
446 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
447 
448 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
449 						  struct syscall_arg *arg)
450 {
451 	int printed = 0, flags = arg->val;
452 
453 #define P_MREMAP_FLAG(n) \
454 	if (flags & MREMAP_##n) { \
455 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
456 		flags &= ~MREMAP_##n; \
457 	}
458 
459 	P_MREMAP_FLAG(MAYMOVE);
460 #ifdef MREMAP_FIXED
461 	P_MREMAP_FLAG(FIXED);
462 #endif
463 #undef P_MREMAP_FLAG
464 
465 	if (flags)
466 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
467 
468 	return printed;
469 }
470 
471 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
472 
473 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
474 						      struct syscall_arg *arg)
475 {
476 	int behavior = arg->val;
477 
478 	switch (behavior) {
479 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
480 	P_MADV_BHV(NORMAL);
481 	P_MADV_BHV(RANDOM);
482 	P_MADV_BHV(SEQUENTIAL);
483 	P_MADV_BHV(WILLNEED);
484 	P_MADV_BHV(DONTNEED);
485 	P_MADV_BHV(REMOVE);
486 	P_MADV_BHV(DONTFORK);
487 	P_MADV_BHV(DOFORK);
488 	P_MADV_BHV(HWPOISON);
489 #ifdef MADV_SOFT_OFFLINE
490 	P_MADV_BHV(SOFT_OFFLINE);
491 #endif
492 	P_MADV_BHV(MERGEABLE);
493 	P_MADV_BHV(UNMERGEABLE);
494 #ifdef MADV_HUGEPAGE
495 	P_MADV_BHV(HUGEPAGE);
496 #endif
497 #ifdef MADV_NOHUGEPAGE
498 	P_MADV_BHV(NOHUGEPAGE);
499 #endif
500 #ifdef MADV_DONTDUMP
501 	P_MADV_BHV(DONTDUMP);
502 #endif
503 #ifdef MADV_DODUMP
504 	P_MADV_BHV(DODUMP);
505 #endif
506 #undef P_MADV_PHV
507 	default: break;
508 	}
509 
510 	return scnprintf(bf, size, "%#x", behavior);
511 }
512 
513 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
514 
515 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
516 					   struct syscall_arg *arg)
517 {
518 	int printed = 0, op = arg->val;
519 
520 	if (op == 0)
521 		return scnprintf(bf, size, "NONE");
522 #define	P_CMD(cmd) \
523 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
524 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
525 		op &= ~LOCK_##cmd; \
526 	}
527 
528 	P_CMD(SH);
529 	P_CMD(EX);
530 	P_CMD(NB);
531 	P_CMD(UN);
532 	P_CMD(MAND);
533 	P_CMD(RW);
534 	P_CMD(READ);
535 	P_CMD(WRITE);
536 #undef P_OP
537 
538 	if (op)
539 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
540 
541 	return printed;
542 }
543 
544 #define SCA_FLOCK syscall_arg__scnprintf_flock
545 
546 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
547 {
548 	enum syscall_futex_args {
549 		SCF_UADDR   = (1 << 0),
550 		SCF_OP	    = (1 << 1),
551 		SCF_VAL	    = (1 << 2),
552 		SCF_TIMEOUT = (1 << 3),
553 		SCF_UADDR2  = (1 << 4),
554 		SCF_VAL3    = (1 << 5),
555 	};
556 	int op = arg->val;
557 	int cmd = op & FUTEX_CMD_MASK;
558 	size_t printed = 0;
559 
560 	switch (cmd) {
561 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
562 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
563 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
565 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
566 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
567 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
568 	P_FUTEX_OP(WAKE_OP);							  break;
569 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
571 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
572 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
573 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
574 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
575 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
576 	}
577 
578 	if (op & FUTEX_PRIVATE_FLAG)
579 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
580 
581 	if (op & FUTEX_CLOCK_REALTIME)
582 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
583 
584 	return printed;
585 }
586 
587 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
588 
589 static const char *bpf_cmd[] = {
590 	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
591 	"MAP_GET_NEXT_KEY", "PROG_LOAD",
592 };
593 static DEFINE_STRARRAY(bpf_cmd);
594 
595 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
596 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
597 
598 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
599 static DEFINE_STRARRAY(itimers);
600 
601 static const char *keyctl_options[] = {
602 	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
603 	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
604 	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
605 	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
606 	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
607 };
608 static DEFINE_STRARRAY(keyctl_options);
609 
610 static const char *whences[] = { "SET", "CUR", "END",
611 #ifdef SEEK_DATA
612 "DATA",
613 #endif
614 #ifdef SEEK_HOLE
615 "HOLE",
616 #endif
617 };
618 static DEFINE_STRARRAY(whences);
619 
620 static const char *fcntl_cmds[] = {
621 	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
622 	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
623 	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
624 	"F_GETOWNER_UIDS",
625 };
626 static DEFINE_STRARRAY(fcntl_cmds);
627 
628 static const char *rlimit_resources[] = {
629 	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
630 	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
631 	"RTTIME",
632 };
633 static DEFINE_STRARRAY(rlimit_resources);
634 
635 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
636 static DEFINE_STRARRAY(sighow);
637 
638 static const char *clockid[] = {
639 	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
640 	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
641 	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
642 };
643 static DEFINE_STRARRAY(clockid);
644 
/*
 * Socket address family names, indexed by the AF_ value (prefix dropped).
 *
 * The table has to stay dense: "MPLS" (AF_MPLS, 28) sits between "IB" (27)
 * and "CAN" (29).  Leaving it out shifts CAN and every later family down by
 * one, so they would all be printed under the wrong name.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
653 static DEFINE_STRARRAY(socket_families);
654 
655 #ifndef SOCK_TYPE_MASK
656 #define SOCK_TYPE_MASK 0xf
657 #endif
658 
659 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
660 						      struct syscall_arg *arg)
661 {
662 	size_t printed;
663 	int type = arg->val,
664 	    flags = type & ~SOCK_TYPE_MASK;
665 
666 	type &= SOCK_TYPE_MASK;
667 	/*
668  	 * Can't use a strarray, MIPS may override for ABI reasons.
669  	 */
670 	switch (type) {
671 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
672 	P_SK_TYPE(STREAM);
673 	P_SK_TYPE(DGRAM);
674 	P_SK_TYPE(RAW);
675 	P_SK_TYPE(RDM);
676 	P_SK_TYPE(SEQPACKET);
677 	P_SK_TYPE(DCCP);
678 	P_SK_TYPE(PACKET);
679 #undef P_SK_TYPE
680 	default:
681 		printed = scnprintf(bf, size, "%#x", type);
682 	}
683 
684 #define	P_SK_FLAG(n) \
685 	if (flags & SOCK_##n) { \
686 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
687 		flags &= ~SOCK_##n; \
688 	}
689 
690 	P_SK_FLAG(CLOEXEC);
691 	P_SK_FLAG(NONBLOCK);
692 #undef P_SK_FLAG
693 
694 	if (flags)
695 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
696 
697 	return printed;
698 }
699 
700 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
701 
702 #ifndef MSG_PROBE
703 #define MSG_PROBE	     0x10
704 #endif
705 #ifndef MSG_WAITFORONE
706 #define MSG_WAITFORONE	0x10000
707 #endif
708 #ifndef MSG_SENDPAGE_NOTLAST
709 #define MSG_SENDPAGE_NOTLAST 0x20000
710 #endif
711 #ifndef MSG_FASTOPEN
712 #define MSG_FASTOPEN	     0x20000000
713 #endif
714 
715 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
716 					       struct syscall_arg *arg)
717 {
718 	int printed = 0, flags = arg->val;
719 
720 	if (flags == 0)
721 		return scnprintf(bf, size, "NONE");
722 #define	P_MSG_FLAG(n) \
723 	if (flags & MSG_##n) { \
724 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
725 		flags &= ~MSG_##n; \
726 	}
727 
728 	P_MSG_FLAG(OOB);
729 	P_MSG_FLAG(PEEK);
730 	P_MSG_FLAG(DONTROUTE);
731 	P_MSG_FLAG(TRYHARD);
732 	P_MSG_FLAG(CTRUNC);
733 	P_MSG_FLAG(PROBE);
734 	P_MSG_FLAG(TRUNC);
735 	P_MSG_FLAG(DONTWAIT);
736 	P_MSG_FLAG(EOR);
737 	P_MSG_FLAG(WAITALL);
738 	P_MSG_FLAG(FIN);
739 	P_MSG_FLAG(SYN);
740 	P_MSG_FLAG(CONFIRM);
741 	P_MSG_FLAG(RST);
742 	P_MSG_FLAG(ERRQUEUE);
743 	P_MSG_FLAG(NOSIGNAL);
744 	P_MSG_FLAG(MORE);
745 	P_MSG_FLAG(WAITFORONE);
746 	P_MSG_FLAG(SENDPAGE_NOTLAST);
747 	P_MSG_FLAG(FASTOPEN);
748 	P_MSG_FLAG(CMSG_CLOEXEC);
749 #undef P_MSG_FLAG
750 
751 	if (flags)
752 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
753 
754 	return printed;
755 }
756 
757 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
758 
759 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
760 						 struct syscall_arg *arg)
761 {
762 	size_t printed = 0;
763 	int mode = arg->val;
764 
765 	if (mode == F_OK) /* 0 */
766 		return scnprintf(bf, size, "F");
767 #define	P_MODE(n) \
768 	if (mode & n##_OK) { \
769 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
770 		mode &= ~n##_OK; \
771 	}
772 
773 	P_MODE(R);
774 	P_MODE(W);
775 	P_MODE(X);
776 #undef P_MODE
777 
778 	if (mode)
779 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
780 
781 	return printed;
782 }
783 
784 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
785 
/*
 * Filename formatter; only declared here so the syscall table can reference
 * it, the definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
790 
791 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
792 					       struct syscall_arg *arg)
793 {
794 	int printed = 0, flags = arg->val;
795 
796 	if (!(flags & O_CREAT))
797 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
798 
799 	if (flags == 0)
800 		return scnprintf(bf, size, "RDONLY");
801 #define	P_FLAG(n) \
802 	if (flags & O_##n) { \
803 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
804 		flags &= ~O_##n; \
805 	}
806 
807 	P_FLAG(APPEND);
808 	P_FLAG(ASYNC);
809 	P_FLAG(CLOEXEC);
810 	P_FLAG(CREAT);
811 	P_FLAG(DIRECT);
812 	P_FLAG(DIRECTORY);
813 	P_FLAG(EXCL);
814 	P_FLAG(LARGEFILE);
815 	P_FLAG(NOATIME);
816 	P_FLAG(NOCTTY);
817 #ifdef O_NONBLOCK
818 	P_FLAG(NONBLOCK);
819 #elif O_NDELAY
820 	P_FLAG(NDELAY);
821 #endif
822 #ifdef O_PATH
823 	P_FLAG(PATH);
824 #endif
825 	P_FLAG(RDWR);
826 #ifdef O_DSYNC
827 	if ((flags & O_SYNC) == O_SYNC)
828 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
829 	else {
830 		P_FLAG(DSYNC);
831 	}
832 #else
833 	P_FLAG(SYNC);
834 #endif
835 	P_FLAG(TRUNC);
836 	P_FLAG(WRONLY);
837 #undef P_FLAG
838 
839 	if (flags)
840 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
841 
842 	return printed;
843 }
844 
845 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
846 
847 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
848 						struct syscall_arg *arg)
849 {
850 	int printed = 0, flags = arg->val;
851 
852 	if (flags == 0)
853 		return 0;
854 
855 #define	P_FLAG(n) \
856 	if (flags & PERF_FLAG_##n) { \
857 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
858 		flags &= ~PERF_FLAG_##n; \
859 	}
860 
861 	P_FLAG(FD_NO_GROUP);
862 	P_FLAG(FD_OUTPUT);
863 	P_FLAG(PID_CGROUP);
864 	P_FLAG(FD_CLOEXEC);
865 #undef P_FLAG
866 
867 	if (flags)
868 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
869 
870 	return printed;
871 }
872 
873 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
874 
875 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
876 						   struct syscall_arg *arg)
877 {
878 	int printed = 0, flags = arg->val;
879 
880 	if (flags == 0)
881 		return scnprintf(bf, size, "NONE");
882 #define	P_FLAG(n) \
883 	if (flags & EFD_##n) { \
884 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
885 		flags &= ~EFD_##n; \
886 	}
887 
888 	P_FLAG(SEMAPHORE);
889 	P_FLAG(CLOEXEC);
890 	P_FLAG(NONBLOCK);
891 #undef P_FLAG
892 
893 	if (flags)
894 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
895 
896 	return printed;
897 }
898 
899 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
900 
901 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
902 						struct syscall_arg *arg)
903 {
904 	int printed = 0, flags = arg->val;
905 
906 #define	P_FLAG(n) \
907 	if (flags & O_##n) { \
908 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
909 		flags &= ~O_##n; \
910 	}
911 
912 	P_FLAG(CLOEXEC);
913 	P_FLAG(NONBLOCK);
914 #undef P_FLAG
915 
916 	if (flags)
917 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
918 
919 	return printed;
920 }
921 
922 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
923 
924 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
925 {
926 	int sig = arg->val;
927 
928 	switch (sig) {
929 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
930 	P_SIGNUM(HUP);
931 	P_SIGNUM(INT);
932 	P_SIGNUM(QUIT);
933 	P_SIGNUM(ILL);
934 	P_SIGNUM(TRAP);
935 	P_SIGNUM(ABRT);
936 	P_SIGNUM(BUS);
937 	P_SIGNUM(FPE);
938 	P_SIGNUM(KILL);
939 	P_SIGNUM(USR1);
940 	P_SIGNUM(SEGV);
941 	P_SIGNUM(USR2);
942 	P_SIGNUM(PIPE);
943 	P_SIGNUM(ALRM);
944 	P_SIGNUM(TERM);
945 	P_SIGNUM(CHLD);
946 	P_SIGNUM(CONT);
947 	P_SIGNUM(STOP);
948 	P_SIGNUM(TSTP);
949 	P_SIGNUM(TTIN);
950 	P_SIGNUM(TTOU);
951 	P_SIGNUM(URG);
952 	P_SIGNUM(XCPU);
953 	P_SIGNUM(XFSZ);
954 	P_SIGNUM(VTALRM);
955 	P_SIGNUM(PROF);
956 	P_SIGNUM(WINCH);
957 	P_SIGNUM(IO);
958 	P_SIGNUM(PWR);
959 	P_SIGNUM(SYS);
960 #ifdef SIGEMT
961 	P_SIGNUM(EMT);
962 #endif
963 #ifdef SIGSTKFLT
964 	P_SIGNUM(STKFLT);
965 #endif
966 #ifdef SIGSWI
967 	P_SIGNUM(SWI);
968 #endif
969 	default: break;
970 	}
971 
972 	return scnprintf(bf, size, "%#x", sig);
973 }
974 
975 #define SCA_SIGNUM syscall_arg__scnprintf_signum
976 
977 #if defined(__i386__) || defined(__x86_64__)
978 /*
979  * FIXME: Make this available to all arches.
980  */
981 #define TCGETS		0x5401
982 
983 static const char *tioctls[] = {
984 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
985 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
986 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
987 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
988 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
989 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
990 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
991 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
992 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
993 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
994 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
995 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
996 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
997 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
998 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
999 };
1000 
1001 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1002 #endif /* defined(__i386__) || defined(__x86_64__) */
1003 
/*
 * Initializer fragment for a struct syscall_fmt entry: format argument
 * number @arg with SCA_STRARRAY using the strarray__<array> table.
 * @name is not used in the expansion; it only labels the argument at the
 * point of use.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
1007 
1008 static struct syscall_fmt {
1009 	const char *name;
1010 	const char *alias;
1011 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1012 	void	   *arg_parm[6];
1013 	bool	   errmsg;
1014 	bool	   timeout;
1015 	bool	   hexret;
1016 } syscall_fmts[] = {
1017 	{ .name	    = "access",	    .errmsg = true,
1018 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1019 			     [1] = SCA_ACCMODE,  /* mode */ }, },
1020 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1021 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1022 	{ .name	    = "brk",	    .hexret = true,
1023 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1024 	{ .name	    = "chdir",	    .errmsg = true,
1025 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1026 	{ .name	    = "chmod",	    .errmsg = true,
1027 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1028 	{ .name	    = "chroot",	    .errmsg = true,
1029 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1030 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1031 	{ .name	    = "close",	    .errmsg = true,
1032 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1033 	{ .name	    = "connect",    .errmsg = true, },
1034 	{ .name	    = "creat",	    .errmsg = true,
1035 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1036 	{ .name	    = "dup",	    .errmsg = true,
1037 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1038 	{ .name	    = "dup2",	    .errmsg = true,
1039 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040 	{ .name	    = "dup3",	    .errmsg = true,
1041 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1042 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1043 	{ .name	    = "eventfd2",   .errmsg = true,
1044 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1045 	{ .name	    = "faccessat",  .errmsg = true,
1046 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1047 			     [1] = SCA_FILENAME, /* filename */ }, },
1048 	{ .name	    = "fadvise64",  .errmsg = true,
1049 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1050 	{ .name	    = "fallocate",  .errmsg = true,
1051 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1052 	{ .name	    = "fchdir",	    .errmsg = true,
1053 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1054 	{ .name	    = "fchmod",	    .errmsg = true,
1055 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1056 	{ .name	    = "fchmodat",   .errmsg = true,
1057 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1058 			     [1] = SCA_FILENAME, /* filename */ }, },
1059 	{ .name	    = "fchown",	    .errmsg = true,
1060 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 	{ .name	    = "fchownat",   .errmsg = true,
1062 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1063 			     [1] = SCA_FILENAME, /* filename */ }, },
1064 	{ .name	    = "fcntl",	    .errmsg = true,
1065 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1066 			     [1] = SCA_STRARRAY, /* cmd */ },
1067 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1068 	{ .name	    = "fdatasync",  .errmsg = true,
1069 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1070 	{ .name	    = "flock",	    .errmsg = true,
1071 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1072 			     [1] = SCA_FLOCK, /* cmd */ }, },
1073 	{ .name	    = "fsetxattr",  .errmsg = true,
1074 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1075 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1076 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1077 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1078 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1079 			     [1] = SCA_FILENAME, /* filename */ }, },
1080 	{ .name	    = "fstatfs",    .errmsg = true,
1081 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1082 	{ .name	    = "fsync",    .errmsg = true,
1083 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1084 	{ .name	    = "ftruncate", .errmsg = true,
1085 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1086 	{ .name	    = "futex",	    .errmsg = true,
1087 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1088 	{ .name	    = "futimesat", .errmsg = true,
1089 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1090 			     [1] = SCA_FILENAME, /* filename */ }, },
1091 	{ .name	    = "getdents",   .errmsg = true,
1092 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093 	{ .name	    = "getdents64", .errmsg = true,
1094 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1095 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1096 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1097 	{ .name	    = "getxattr",    .errmsg = true,
1098 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1099 	{ .name	    = "inotify_add_watch",	    .errmsg = true,
1100 	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1101 	{ .name	    = "ioctl",	    .errmsg = true,
1102 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1103 #if defined(__i386__) || defined(__x86_64__)
1104 /*
1105  * FIXME: Make this available to all arches.
1106  */
1107 			     [1] = SCA_STRHEXARRAY, /* cmd */
1108 			     [2] = SCA_HEX, /* arg */ },
1109 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1110 #else
1111 			     [2] = SCA_HEX, /* arg */ }, },
1112 #endif
1113 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
1114 	{ .name	    = "kill",	    .errmsg = true,
1115 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1116 	{ .name	    = "lchown",    .errmsg = true,
1117 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1118 	{ .name	    = "lgetxattr",  .errmsg = true,
1119 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1120 	{ .name	    = "linkat",	    .errmsg = true,
1121 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1122 	{ .name	    = "listxattr",  .errmsg = true,
1123 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1124 	{ .name	    = "llistxattr", .errmsg = true,
1125 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1126 	{ .name	    = "lremovexattr",  .errmsg = true,
1127 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1128 	{ .name	    = "lseek",	    .errmsg = true,
1129 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1130 			     [2] = SCA_STRARRAY, /* whence */ },
1131 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1132 	{ .name	    = "lsetxattr",  .errmsg = true,
1133 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1134 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat",
1135 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1136 	{ .name	    = "lsxattr",    .errmsg = true,
1137 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1138 	{ .name     = "madvise",    .errmsg = true,
1139 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1140 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1141 	{ .name	    = "mkdir",    .errmsg = true,
1142 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1143 	{ .name	    = "mkdirat",    .errmsg = true,
1144 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1145 			     [1] = SCA_FILENAME, /* pathname */ }, },
1146 	{ .name	    = "mknod",      .errmsg = true,
1147 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1148 	{ .name	    = "mknodat",    .errmsg = true,
1149 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1150 			     [1] = SCA_FILENAME, /* filename */ }, },
1151 	{ .name	    = "mlock",	    .errmsg = true,
1152 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1153 	{ .name	    = "mlockall",   .errmsg = true,
1154 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1155 	{ .name	    = "mmap",	    .hexret = true,
1156 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1157 			     [2] = SCA_MMAP_PROT, /* prot */
1158 			     [3] = SCA_MMAP_FLAGS, /* flags */
1159 			     [4] = SCA_FD, 	  /* fd */ }, },
1160 	{ .name	    = "mprotect",   .errmsg = true,
1161 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1162 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1163 	{ .name	    = "mq_unlink", .errmsg = true,
1164 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1165 	{ .name	    = "mremap",	    .hexret = true,
1166 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1167 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1168 			     [4] = SCA_HEX, /* new_addr */ }, },
1169 	{ .name	    = "munlock",    .errmsg = true,
1170 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1171 	{ .name	    = "munmap",	    .errmsg = true,
1172 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1173 	{ .name	    = "name_to_handle_at", .errmsg = true,
1174 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1175 	{ .name	    = "newfstatat", .errmsg = true,
1176 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1177 			     [1] = SCA_FILENAME, /* filename */ }, },
1178 	{ .name	    = "open",	    .errmsg = true,
1179 	  .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1180 			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1181 	{ .name	    = "open_by_handle_at", .errmsg = true,
1182 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1183 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1184 	{ .name	    = "openat",	    .errmsg = true,
1185 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1186 			     [1] = SCA_FILENAME, /* filename */
1187 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1188 	{ .name	    = "perf_event_open", .errmsg = true,
1189 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1190 			     [2] = SCA_INT, /* cpu */
1191 			     [3] = SCA_FD,  /* group_fd */
1192 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1193 	{ .name	    = "pipe2",	    .errmsg = true,
1194 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1195 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1196 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1197 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1198 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1199 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1200 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1201 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1202 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1203 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204 	{ .name	    = "pwritev",    .errmsg = true,
1205 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1206 	{ .name	    = "read",	    .errmsg = true,
1207 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1208 	{ .name	    = "readlink",   .errmsg = true,
1209 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1210 	{ .name	    = "readlinkat", .errmsg = true,
1211 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1212 			     [1] = SCA_FILENAME, /* pathname */ }, },
1213 	{ .name	    = "readv",	    .errmsg = true,
1214 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1215 	{ .name	    = "recvfrom",   .errmsg = true,
1216 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1217 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1218 	{ .name	    = "recvmmsg",   .errmsg = true,
1219 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1220 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1221 	{ .name	    = "recvmsg",    .errmsg = true,
1222 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1223 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1224 	{ .name	    = "removexattr", .errmsg = true,
1225 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1226 	{ .name	    = "renameat",   .errmsg = true,
1227 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1228 	{ .name	    = "rmdir",    .errmsg = true,
1229 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1230 	{ .name	    = "rt_sigaction", .errmsg = true,
1231 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1232 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1233 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1234 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1235 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1236 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1237 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1238 	{ .name	    = "sendmmsg",    .errmsg = true,
1239 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1240 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1241 	{ .name	    = "sendmsg",    .errmsg = true,
1242 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1243 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1244 	{ .name	    = "sendto",	    .errmsg = true,
1245 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1246 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1247 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1248 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1249 	{ .name	    = "setxattr",   .errmsg = true,
1250 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1251 	{ .name	    = "shutdown",   .errmsg = true,
1252 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1253 	{ .name	    = "socket",	    .errmsg = true,
1254 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1255 			     [1] = SCA_SK_TYPE, /* type */ },
1256 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1257 	{ .name	    = "socketpair", .errmsg = true,
1258 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1259 			     [1] = SCA_SK_TYPE, /* type */ },
1260 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1261 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat",
1262 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1263 	{ .name	    = "statfs",	    .errmsg = true,
1264 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1265 	{ .name	    = "swapoff",    .errmsg = true,
1266 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1267 	{ .name	    = "swapon",	    .errmsg = true,
1268 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1269 	{ .name	    = "symlinkat",  .errmsg = true,
1270 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1271 	{ .name	    = "tgkill",	    .errmsg = true,
1272 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1273 	{ .name	    = "tkill",	    .errmsg = true,
1274 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1275 	{ .name	    = "truncate",   .errmsg = true,
1276 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1277 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1278 	{ .name	    = "unlinkat",   .errmsg = true,
1279 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1280 			     [1] = SCA_FILENAME, /* pathname */ }, },
1281 	{ .name	    = "utime",  .errmsg = true,
1282 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1283 	{ .name	    = "utimensat",  .errmsg = true,
1284 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1285 			     [1] = SCA_FILENAME, /* filename */ }, },
1286 	{ .name	    = "utimes",  .errmsg = true,
1287 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1288 	{ .name	    = "vmsplice",  .errmsg = true,
1289 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1290 	{ .name	    = "write",	    .errmsg = true,
1291 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1292 	{ .name	    = "writev",	    .errmsg = true,
1293 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1294 };
1295 
1296 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1297 {
1298 	const struct syscall_fmt *fmt = fmtp;
1299 	return strcmp(name, fmt->name);
1300 }
1301 
1302 static struct syscall_fmt *syscall_fmt__find(const char *name)
1303 {
1304 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1305 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1306 }
1307 
/*
 * Per-syscall state, one slot per syscall id in trace->syscalls.table,
 * filled in lazily by trace__read_syscall_info().
 *
 * tp_format:     format of the syscalls:sys_enter_<name> tracepoint
 * nr_args:       number of entries in 'args'
 * args:          tracepoint fields describing the arguments, with the
 *                leading syscall number field ('__syscall_nr' or 'nr')
 *                skipped when present
 * name:          syscall name; NULL means the slot was not read yet
 * is_exit:       set for "exit" and "exit_group"
 * fmt:           matching syscall_fmts entry, NULL if there is none
 * arg_scnprintf: per argument formatter, indexed by argument position;
 *                a NULL entry means "no special formatter"
 * arg_parm:      per argument formatter parameter, taken from fmt->arg_parm;
 *                left unset when there is no fmt
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
1318 
1319 static size_t fprintf_duration(unsigned long t, FILE *fp)
1320 {
1321 	double duration = (double)t / NSEC_PER_MSEC;
1322 	size_t printed = fprintf(fp, "(");
1323 
1324 	if (duration >= 1.0)
1325 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1326 	else if (duration >= 0.01)
1327 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1328 	else
1329 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1330 	return printed + fprintf(fp, "): ");
1331 }
1332 
/**
 * struct thread_trace - per thread tracing state, stored as the thread's priv
 *
 * entry_time: timestamp of the last sys_enter sample seen for this thread
 * entry_pending: a syscall entry was formatted into entry_str but not yet
 *                printed
 * nr_events: bumped each time thread__trace() is called for this thread
 * entry_str: buffer of trace__entry_str_size bytes holding the formatted
 *            syscall entry ("name(args")
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * paths.max: highest fd that has a slot in paths.table, -1 when empty
 * paths.table: fd indexed table of strdup()'d pathnames, NULL when unknown
 * syscall_stats: intlist keyed by syscall id, each node's priv pointing to
 *                a struct stats fed with syscall durations
 *
 * NOTE(review): exit_time, pfmaj/pfmin, runtime_ms, filename.namelen and
 * filename.name are maintained by code outside this part of the file.
 */
struct thread_trace {
	u64		  entry_time;
	u64		  exit_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;
		char	  **table;
	} paths;

	struct intlist *syscall_stats;
};
1360 
1361 static struct thread_trace *thread_trace__new(void)
1362 {
1363 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1364 
1365 	if (ttrace)
1366 		ttrace->paths.max = -1;
1367 
1368 	ttrace->syscall_stats = intlist__new(NULL);
1369 
1370 	return ttrace;
1371 }
1372 
1373 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1374 {
1375 	struct thread_trace *ttrace;
1376 
1377 	if (thread == NULL)
1378 		goto fail;
1379 
1380 	if (thread__priv(thread) == NULL)
1381 		thread__set_priv(thread, thread_trace__new());
1382 
1383 	if (thread__priv(thread) == NULL)
1384 		goto fail;
1385 
1386 	ttrace = thread__priv(thread);
1387 	++ttrace->nr_events;
1388 
1389 	return ttrace;
1390 fail:
1391 	color_fprintf(fp, PERF_COLOR_RED,
1392 		      "WARNING: not enough memory, dropping samples!\n");
1393 	return NULL;
1394 }
1395 
/*
 * Bit flags for major and minor page faults.
 * NOTE(review): presumably OR'ed into trace->trace_pgfaults - confirm.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per thread buffer (thread_trace->entry_str) a syscall entry is formatted into */
static const size_t trace__entry_str_size = 2048;
1400 
/*
 * Global state of a 'perf trace' session.
 *
 * Only the fields whose use is visible near this definition are described
 * here; the others are maintained by code elsewhere in the file.
 */
struct trace {
	struct perf_tool	tool;
	struct {
		int		machine;	/* machine type handed to the libaudit name <-> id lookups */
		int		open_id;
	}			audit;
	struct {
		int		max;		/* highest id with a slot in 'table', -1 when empty */
		struct syscall  *table;		/* indexed by syscall id, filled in lazily */
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;		/* created by trace__symbols_init() */
	struct thread		*current;	/* thread whose pending entry may get printed as interrupted */
	u64			base_time;	/* subtracted from sample timestamps before printing */
	FILE			*output;	/* where all trace output and diagnostics go */
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall names to validate, see trace__validate_ev_qualifier() */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	double			duration_filter; /* threshold in msecs, see trace__filter_duration() */
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;	/* times a fd path had to be read from /proc */
	} stats;
	bool			not_ev_qualifier;
	bool			live;		/* when false, fd paths are never looked up in /proc */
	bool			full_time;
	bool			sched;
	bool			multiple_threads; /* prefix each line with the tid (and comm if show_comm) */
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			force;
	bool			vfs_getname;	/* filenames are resolved later instead of printed as pointers */
	int			trace_pgfaults;
};
1453 
1454 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1455 {
1456 	struct thread_trace *ttrace = thread__priv(thread);
1457 
1458 	if (fd > ttrace->paths.max) {
1459 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1460 
1461 		if (npath == NULL)
1462 			return -1;
1463 
1464 		if (ttrace->paths.max != -1) {
1465 			memset(npath + ttrace->paths.max + 1, 0,
1466 			       (fd - ttrace->paths.max) * sizeof(char *));
1467 		} else {
1468 			memset(npath, 0, (fd + 1) * sizeof(char *));
1469 		}
1470 
1471 		ttrace->paths.table = npath;
1472 		ttrace->paths.max   = fd;
1473 	}
1474 
1475 	ttrace->paths.table[fd] = strdup(pathname);
1476 
1477 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1478 }
1479 
1480 static int thread__read_fd_path(struct thread *thread, int fd)
1481 {
1482 	char linkname[PATH_MAX], pathname[PATH_MAX];
1483 	struct stat st;
1484 	int ret;
1485 
1486 	if (thread->pid_ == thread->tid) {
1487 		scnprintf(linkname, sizeof(linkname),
1488 			  "/proc/%d/fd/%d", thread->pid_, fd);
1489 	} else {
1490 		scnprintf(linkname, sizeof(linkname),
1491 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1492 	}
1493 
1494 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1495 		return -1;
1496 
1497 	ret = readlink(linkname, pathname, sizeof(pathname));
1498 
1499 	if (ret < 0 || ret > st.st_size)
1500 		return -1;
1501 
1502 	pathname[ret] = '\0';
1503 	return trace__set_fd_pathname(thread, fd, pathname);
1504 }
1505 
1506 static const char *thread__fd_path(struct thread *thread, int fd,
1507 				   struct trace *trace)
1508 {
1509 	struct thread_trace *ttrace = thread__priv(thread);
1510 
1511 	if (ttrace == NULL)
1512 		return NULL;
1513 
1514 	if (fd < 0)
1515 		return NULL;
1516 
1517 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1518 		if (!trace->live)
1519 			return NULL;
1520 		++trace->stats.proc_getname;
1521 		if (thread__read_fd_path(thread, fd))
1522 			return NULL;
1523 	}
1524 
1525 	return ttrace->paths.table[fd];
1526 }
1527 
1528 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1529 					struct syscall_arg *arg)
1530 {
1531 	int fd = arg->val;
1532 	size_t printed = scnprintf(bf, size, "%d", fd);
1533 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1534 
1535 	if (path)
1536 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1537 
1538 	return printed;
1539 }
1540 
1541 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1542 					      struct syscall_arg *arg)
1543 {
1544 	int fd = arg->val;
1545 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1546 	struct thread_trace *ttrace = thread__priv(arg->thread);
1547 
1548 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1549 		zfree(&ttrace->paths.table[fd]);
1550 
1551 	return printed;
1552 }
1553 
1554 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1555 				     unsigned long ptr)
1556 {
1557 	struct thread_trace *ttrace = thread__priv(thread);
1558 
1559 	ttrace->filename.ptr = ptr;
1560 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1561 }
1562 
1563 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1564 					      struct syscall_arg *arg)
1565 {
1566 	unsigned long ptr = arg->val;
1567 
1568 	if (!arg->trace->vfs_getname)
1569 		return scnprintf(bf, size, "%#x", ptr);
1570 
1571 	thread__set_filename_pos(arg->thread, bf, ptr);
1572 	return 0;
1573 }
1574 
1575 static bool trace__filter_duration(struct trace *trace, double t)
1576 {
1577 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1578 }
1579 
1580 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1581 {
1582 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1583 
1584 	return fprintf(fp, "%10.3f ", ts);
1585 }
1586 
/* Set once a handled signal arrived */
static bool done = false;
/* Tells whether the last handled signal was SIGINT */
static bool interrupted = false;

/*
 * Signal handler: flag that a signal arrived and remember whether it was
 * SIGINT.
 */
static void sig_handler(int sig)
{
	interrupted = (sig == SIGINT);
	done = true;
}
1595 
1596 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1597 					u64 duration, u64 tstamp, FILE *fp)
1598 {
1599 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1600 	printed += fprintf_duration(duration, fp);
1601 
1602 	if (trace->multiple_threads) {
1603 		if (trace->show_comm)
1604 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1605 		printed += fprintf(fp, "%d ", thread->tid);
1606 	}
1607 
1608 	return printed;
1609 }
1610 
1611 static int trace__process_event(struct trace *trace, struct machine *machine,
1612 				union perf_event *event, struct perf_sample *sample)
1613 {
1614 	int ret = 0;
1615 
1616 	switch (event->header.type) {
1617 	case PERF_RECORD_LOST:
1618 		color_fprintf(trace->output, PERF_COLOR_RED,
1619 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1620 		ret = machine__process_lost_event(machine, event, sample);
1621 	default:
1622 		ret = machine__process_event(machine, event, sample);
1623 		break;
1624 	}
1625 
1626 	return ret;
1627 }
1628 
1629 static int trace__tool_process(struct perf_tool *tool,
1630 			       union perf_event *event,
1631 			       struct perf_sample *sample,
1632 			       struct machine *machine)
1633 {
1634 	struct trace *trace = container_of(tool, struct trace, tool);
1635 	return trace__process_event(trace, machine, event, sample);
1636 }
1637 
1638 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1639 {
1640 	int err = symbol__init(NULL);
1641 
1642 	if (err)
1643 		return err;
1644 
1645 	trace->host = machine__new_host();
1646 	if (trace->host == NULL)
1647 		return -ENOMEM;
1648 
1649 	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1650 		return -errno;
1651 
1652 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1653 					    evlist->threads, trace__tool_process, false,
1654 					    trace->opts.proc_map_timeout);
1655 	if (err)
1656 		symbol__exit();
1657 
1658 	return err;
1659 }
1660 
1661 static int syscall__set_arg_fmts(struct syscall *sc)
1662 {
1663 	struct format_field *field;
1664 	int idx = 0;
1665 
1666 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1667 	if (sc->arg_scnprintf == NULL)
1668 		return -1;
1669 
1670 	if (sc->fmt)
1671 		sc->arg_parm = sc->fmt->arg_parm;
1672 
1673 	for (field = sc->args; field; field = field->next) {
1674 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1675 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1676 		else if (field->flags & FIELD_IS_POINTER)
1677 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1678 		++idx;
1679 	}
1680 
1681 	return 0;
1682 }
1683 
1684 static int trace__read_syscall_info(struct trace *trace, int id)
1685 {
1686 	char tp_name[128];
1687 	struct syscall *sc;
1688 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1689 
1690 	if (name == NULL)
1691 		return -1;
1692 
1693 	if (id > trace->syscalls.max) {
1694 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1695 
1696 		if (nsyscalls == NULL)
1697 			return -1;
1698 
1699 		if (trace->syscalls.max != -1) {
1700 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1701 			       (id - trace->syscalls.max) * sizeof(*sc));
1702 		} else {
1703 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1704 		}
1705 
1706 		trace->syscalls.table = nsyscalls;
1707 		trace->syscalls.max   = id;
1708 	}
1709 
1710 	sc = trace->syscalls.table + id;
1711 	sc->name = name;
1712 
1713 	sc->fmt  = syscall_fmt__find(sc->name);
1714 
1715 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1716 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1717 
1718 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1719 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1720 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1721 	}
1722 
1723 	if (IS_ERR(sc->tp_format))
1724 		return -1;
1725 
1726 	sc->args = sc->tp_format->format.fields;
1727 	sc->nr_args = sc->tp_format->format.nr_fields;
1728 	/*
1729 	 * We need to check and discard the first variable '__syscall_nr'
1730 	 * or 'nr' that mean the syscall number. It is needless here.
1731 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1732 	 */
1733 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1734 		sc->args = sc->args->next;
1735 		--sc->nr_args;
1736 	}
1737 
1738 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1739 
1740 	return syscall__set_arg_fmts(sc);
1741 }
1742 
1743 static int trace__validate_ev_qualifier(struct trace *trace)
1744 {
1745 	int err = 0, i;
1746 	struct str_node *pos;
1747 
1748 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1749 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1750 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1751 
1752 	if (trace->ev_qualifier_ids.entries == NULL) {
1753 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1754 		       trace->output);
1755 		err = -EINVAL;
1756 		goto out;
1757 	}
1758 
1759 	i = 0;
1760 
1761 	strlist__for_each(pos, trace->ev_qualifier) {
1762 		const char *sc = pos->s;
1763 		int id = audit_name_to_syscall(sc, trace->audit.machine);
1764 
1765 		if (id < 0) {
1766 			if (err == 0) {
1767 				fputs("Error:\tInvalid syscall ", trace->output);
1768 				err = -EINVAL;
1769 			} else {
1770 				fputs(", ", trace->output);
1771 			}
1772 
1773 			fputs(sc, trace->output);
1774 		}
1775 
1776 		trace->ev_qualifier_ids.entries[i++] = id;
1777 	}
1778 
1779 	if (err < 0) {
1780 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1781 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1782 		zfree(&trace->ev_qualifier_ids.entries);
1783 		trace->ev_qualifier_ids.nr = 0;
1784 	}
1785 out:
1786 	return err;
1787 }
1788 
/*
 * Format the arguments of syscall 'sc' into 'bf' as a comma separated
 * list, returning the number of characters added.
 *
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 *
 * When the tracepoint fields are known each argument is printed as
 * "name: value", using its formatter if it has one and "%ld" otherwise.
 * When they are not, six raw values are printed as "argN: value".
 */

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		/* 'bit' is the mask bit of the current argument: 1 << arg.idx */
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/*
			 * Skip arguments whose bit is set in arg.mask.
			 * NOTE(review): the mask starts as zero here, so it is
			 * presumably set by the formatters, which receive &arg -
			 * confirm.
			 */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			/* Separate from the previous argument, if any was printed */
			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else {
		int i = 0;

		/* No field description available: dump six raw values */
		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1865 
/*
 * Signature of the per tracepoint sample handlers, such as
 * trace__sys_enter(): they get the trace session, the evsel the sample
 * belongs to, the raw event and the parsed sample.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1869 
1870 static struct syscall *trace__syscall_info(struct trace *trace,
1871 					   struct perf_evsel *evsel, int id)
1872 {
1873 
1874 	if (id < 0) {
1875 
1876 		/*
1877 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1878 		 * before that, leaving at a higher verbosity level till that is
1879 		 * explained. Reproduced with plain ftrace with:
1880 		 *
1881 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1882 		 * grep "NR -1 " /t/trace_pipe
1883 		 *
1884 		 * After generating some load on the machine.
1885  		 */
1886 		if (verbose > 1) {
1887 			static u64 n;
1888 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1889 				id, perf_evsel__name(evsel), ++n);
1890 		}
1891 		return NULL;
1892 	}
1893 
1894 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1895 	    trace__read_syscall_info(trace, id))
1896 		goto out_cant_read;
1897 
1898 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1899 		goto out_cant_read;
1900 
1901 	return &trace->syscalls.table[id];
1902 
1903 out_cant_read:
1904 	if (verbose) {
1905 		fprintf(trace->output, "Problems reading syscall %d", id);
1906 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1907 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1908 		fputs(" information\n", trace->output);
1909 	}
1910 	return NULL;
1911 }
1912 
1913 static void thread__update_stats(struct thread_trace *ttrace,
1914 				 int id, struct perf_sample *sample)
1915 {
1916 	struct int_node *inode;
1917 	struct stats *stats;
1918 	u64 duration = 0;
1919 
1920 	inode = intlist__findnew(ttrace->syscall_stats, id);
1921 	if (inode == NULL)
1922 		return;
1923 
1924 	stats = inode->priv;
1925 	if (stats == NULL) {
1926 		stats = malloc(sizeof(struct stats));
1927 		if (stats == NULL)
1928 			return;
1929 		init_stats(stats);
1930 		inode->priv = stats;
1931 	}
1932 
1933 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1934 		duration = sample->time - ttrace->entry_time;
1935 
1936 	update_stats(stats, duration);
1937 }
1938 
1939 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1940 {
1941 	struct thread_trace *ttrace;
1942 	u64 duration;
1943 	size_t printed;
1944 
1945 	if (trace->current == NULL)
1946 		return 0;
1947 
1948 	ttrace = thread__priv(trace->current);
1949 
1950 	if (!ttrace->entry_pending)
1951 		return 0;
1952 
1953 	duration = sample->time - ttrace->entry_time;
1954 
1955 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1956 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1957 	ttrace->entry_pending = false;
1958 
1959 	return printed;
1960 }
1961 
1962 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1963 			    union perf_event *event __maybe_unused,
1964 			    struct perf_sample *sample)
1965 {
1966 	char *msg;
1967 	void *args;
1968 	size_t printed = 0;
1969 	struct thread *thread;
1970 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1971 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1972 	struct thread_trace *ttrace;
1973 
1974 	if (sc == NULL)
1975 		return -1;
1976 
1977 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978 	ttrace = thread__trace(thread, trace->output);
1979 	if (ttrace == NULL)
1980 		goto out_put;
1981 
1982 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1983 
1984 	if (ttrace->entry_str == NULL) {
1985 		ttrace->entry_str = malloc(trace__entry_str_size);
1986 		if (!ttrace->entry_str)
1987 			goto out_put;
1988 	}
1989 
1990 	if (!trace->summary_only)
1991 		trace__printf_interrupted_entry(trace, sample);
1992 
1993 	ttrace->entry_time = sample->time;
1994 	msg = ttrace->entry_str;
1995 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1996 
1997 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1998 					   args, trace, thread);
1999 
2000 	if (sc->is_exit) {
2001 		if (!trace->duration_filter && !trace->summary_only) {
2002 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2003 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2004 		}
2005 	} else {
2006 		ttrace->entry_pending = true;
2007 		/* See trace__vfs_getname & trace__sys_exit */
2008 		ttrace->filename.pending_open = false;
2009 	}
2010 
2011 	if (trace->current != thread) {
2012 		thread__put(trace->current);
2013 		trace->current = thread__get(thread);
2014 	}
2015 	err = 0;
2016 out_put:
2017 	thread__put(thread);
2018 	return err;
2019 }
2020 
2021 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2022 			   union perf_event *event __maybe_unused,
2023 			   struct perf_sample *sample)
2024 {
2025 	long ret;
2026 	u64 duration = 0;
2027 	struct thread *thread;
2028 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2029 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2030 	struct thread_trace *ttrace;
2031 
2032 	if (sc == NULL)
2033 		return -1;
2034 
2035 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2036 	ttrace = thread__trace(thread, trace->output);
2037 	if (ttrace == NULL)
2038 		goto out_put;
2039 
2040 	if (trace->summary)
2041 		thread__update_stats(ttrace, id, sample);
2042 
2043 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2044 
2045 	if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2046 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2047 		ttrace->filename.pending_open = false;
2048 		++trace->stats.vfs_getname;
2049 	}
2050 
2051 	ttrace->exit_time = sample->time;
2052 
2053 	if (ttrace->entry_time) {
2054 		duration = sample->time - ttrace->entry_time;
2055 		if (trace__filter_duration(trace, duration))
2056 			goto out;
2057 	} else if (trace->duration_filter)
2058 		goto out;
2059 
2060 	if (trace->summary_only)
2061 		goto out;
2062 
2063 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2064 
2065 	if (ttrace->entry_pending) {
2066 		fprintf(trace->output, "%-70s", ttrace->entry_str);
2067 	} else {
2068 		fprintf(trace->output, " ... [");
2069 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2070 		fprintf(trace->output, "]: %s()", sc->name);
2071 	}
2072 
2073 	if (sc->fmt == NULL) {
2074 signed_print:
2075 		fprintf(trace->output, ") = %ld", ret);
2076 	} else if (ret < 0 && sc->fmt->errmsg) {
2077 		char bf[STRERR_BUFSIZE];
2078 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2079 			   *e = audit_errno_to_name(-ret);
2080 
2081 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
2082 	} else if (ret == 0 && sc->fmt->timeout)
2083 		fprintf(trace->output, ") = 0 Timeout");
2084 	else if (sc->fmt->hexret)
2085 		fprintf(trace->output, ") = %#lx", ret);
2086 	else
2087 		goto signed_print;
2088 
2089 	fputc('\n', trace->output);
2090 out:
2091 	ttrace->entry_pending = false;
2092 	err = 0;
2093 out_put:
2094 	thread__put(thread);
2095 	return err;
2096 }
2097 
2098 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2099 			      union perf_event *event __maybe_unused,
2100 			      struct perf_sample *sample)
2101 {
2102 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2103 	struct thread_trace *ttrace;
2104 	size_t filename_len, entry_str_len, to_move;
2105 	ssize_t remaining_space;
2106 	char *pos;
2107 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2108 
2109 	if (!thread)
2110 		goto out;
2111 
2112 	ttrace = thread__priv(thread);
2113 	if (!ttrace)
2114 		goto out;
2115 
2116 	filename_len = strlen(filename);
2117 
2118 	if (ttrace->filename.namelen < filename_len) {
2119 		char *f = realloc(ttrace->filename.name, filename_len + 1);
2120 
2121 		if (f == NULL)
2122 				goto out;
2123 
2124 		ttrace->filename.namelen = filename_len;
2125 		ttrace->filename.name = f;
2126 	}
2127 
2128 	strcpy(ttrace->filename.name, filename);
2129 	ttrace->filename.pending_open = true;
2130 
2131 	if (!ttrace->filename.ptr)
2132 		goto out;
2133 
2134 	entry_str_len = strlen(ttrace->entry_str);
2135 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2136 	if (remaining_space <= 0)
2137 		goto out;
2138 
2139 	if (filename_len > (size_t)remaining_space) {
2140 		filename += filename_len - remaining_space;
2141 		filename_len = remaining_space;
2142 	}
2143 
2144 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2145 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2146 	memmove(pos + filename_len, pos, to_move);
2147 	memcpy(pos, filename, filename_len);
2148 
2149 	ttrace->filename.ptr = 0;
2150 	ttrace->filename.entry_str_pos = 0;
2151 out:
2152 	return 0;
2153 }
2154 
2155 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2156 				     union perf_event *event __maybe_unused,
2157 				     struct perf_sample *sample)
2158 {
2159         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2160 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2161 	struct thread *thread = machine__findnew_thread(trace->host,
2162 							sample->pid,
2163 							sample->tid);
2164 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2165 
2166 	if (ttrace == NULL)
2167 		goto out_dump;
2168 
2169 	ttrace->runtime_ms += runtime_ms;
2170 	trace->runtime_ms += runtime_ms;
2171 	thread__put(thread);
2172 	return 0;
2173 
2174 out_dump:
2175 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2176 	       evsel->name,
2177 	       perf_evsel__strval(evsel, sample, "comm"),
2178 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2179 	       runtime,
2180 	       perf_evsel__intval(evsel, sample, "vruntime"));
2181 	thread__put(thread);
2182 	return 0;
2183 }
2184 
2185 static void bpf_output__printer(enum binary_printer_ops op,
2186 				unsigned int val, void *extra)
2187 {
2188 	FILE *output = extra;
2189 	unsigned char ch = (unsigned char)val;
2190 
2191 	switch (op) {
2192 	case BINARY_PRINT_CHAR_DATA:
2193 		fprintf(output, "%c", isprint(ch) ? ch : '.');
2194 		break;
2195 	case BINARY_PRINT_DATA_BEGIN:
2196 	case BINARY_PRINT_LINE_BEGIN:
2197 	case BINARY_PRINT_ADDR:
2198 	case BINARY_PRINT_NUM_DATA:
2199 	case BINARY_PRINT_NUM_PAD:
2200 	case BINARY_PRINT_SEP:
2201 	case BINARY_PRINT_CHAR_PAD:
2202 	case BINARY_PRINT_LINE_END:
2203 	case BINARY_PRINT_DATA_END:
2204 	default:
2205 		break;
2206 	}
2207 }
2208 
2209 static void bpf_output__fprintf(struct trace *trace,
2210 				struct perf_sample *sample)
2211 {
2212 	print_binary(sample->raw_data, sample->raw_size, 8,
2213 		     bpf_output__printer, trace->output);
2214 }
2215 
2216 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2217 				union perf_event *event __maybe_unused,
2218 				struct perf_sample *sample)
2219 {
2220 	trace__printf_interrupted_entry(trace, sample);
2221 	trace__fprintf_tstamp(trace, sample->time, trace->output);
2222 
2223 	if (trace->trace_syscalls)
2224 		fprintf(trace->output, "(         ): ");
2225 
2226 	fprintf(trace->output, "%s:", evsel->name);
2227 
2228 	if (perf_evsel__is_bpf_output(evsel)) {
2229 		bpf_output__fprintf(trace, sample);
2230 	} else if (evsel->tp_format) {
2231 		event_format__fprintf(evsel->tp_format, sample->cpu,
2232 				      sample->raw_data, sample->raw_size,
2233 				      trace->output);
2234 	}
2235 
2236 	fprintf(trace->output, ")\n");
2237 	return 0;
2238 }
2239 
2240 static void print_location(FILE *f, struct perf_sample *sample,
2241 			   struct addr_location *al,
2242 			   bool print_dso, bool print_sym)
2243 {
2244 
2245 	if ((verbose || print_dso) && al->map)
2246 		fprintf(f, "%s@", al->map->dso->long_name);
2247 
2248 	if ((verbose || print_sym) && al->sym)
2249 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2250 			al->addr - al->sym->start);
2251 	else if (al->map)
2252 		fprintf(f, "0x%" PRIx64, al->addr);
2253 	else
2254 		fprintf(f, "0x%" PRIx64, sample->addr);
2255 }
2256 
2257 static int trace__pgfault(struct trace *trace,
2258 			  struct perf_evsel *evsel,
2259 			  union perf_event *event __maybe_unused,
2260 			  struct perf_sample *sample)
2261 {
2262 	struct thread *thread;
2263 	struct addr_location al;
2264 	char map_type = 'd';
2265 	struct thread_trace *ttrace;
2266 	int err = -1;
2267 
2268 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2269 	ttrace = thread__trace(thread, trace->output);
2270 	if (ttrace == NULL)
2271 		goto out_put;
2272 
2273 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2274 		ttrace->pfmaj++;
2275 	else
2276 		ttrace->pfmin++;
2277 
2278 	if (trace->summary_only)
2279 		goto out;
2280 
2281 	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
2282 			      sample->ip, &al);
2283 
2284 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2285 
2286 	fprintf(trace->output, "%sfault [",
2287 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2288 		"maj" : "min");
2289 
2290 	print_location(trace->output, sample, &al, false, true);
2291 
2292 	fprintf(trace->output, "] => ");
2293 
2294 	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
2295 				   sample->addr, &al);
2296 
2297 	if (!al.map) {
2298 		thread__find_addr_location(thread, sample->cpumode,
2299 					   MAP__FUNCTION, sample->addr, &al);
2300 
2301 		if (al.map)
2302 			map_type = 'x';
2303 		else
2304 			map_type = '?';
2305 	}
2306 
2307 	print_location(trace->output, sample, &al, true, false);
2308 
2309 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2310 out:
2311 	err = 0;
2312 out_put:
2313 	thread__put(thread);
2314 	return err;
2315 }
2316 
2317 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2318 {
2319 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2320 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2321 		return false;
2322 
2323 	if (trace->pid_list || trace->tid_list)
2324 		return true;
2325 
2326 	return false;
2327 }
2328 
2329 static int trace__process_sample(struct perf_tool *tool,
2330 				 union perf_event *event,
2331 				 struct perf_sample *sample,
2332 				 struct perf_evsel *evsel,
2333 				 struct machine *machine __maybe_unused)
2334 {
2335 	struct trace *trace = container_of(tool, struct trace, tool);
2336 	int err = 0;
2337 
2338 	tracepoint_handler handler = evsel->handler;
2339 
2340 	if (skip_sample(trace, sample))
2341 		return 0;
2342 
2343 	if (!trace->full_time && trace->base_time == 0)
2344 		trace->base_time = sample->time;
2345 
2346 	if (handler) {
2347 		++trace->nr_events;
2348 		handler(trace, evsel, event, sample);
2349 	}
2350 
2351 	return err;
2352 }
2353 
2354 static int parse_target_str(struct trace *trace)
2355 {
2356 	if (trace->opts.target.pid) {
2357 		trace->pid_list = intlist__new(trace->opts.target.pid);
2358 		if (trace->pid_list == NULL) {
2359 			pr_err("Error parsing process id string\n");
2360 			return -EINVAL;
2361 		}
2362 	}
2363 
2364 	if (trace->opts.target.tid) {
2365 		trace->tid_list = intlist__new(trace->opts.target.tid);
2366 		if (trace->tid_list == NULL) {
2367 			pr_err("Error parsing thread id string\n");
2368 			return -EINVAL;
2369 		}
2370 	}
2371 
2372 	return 0;
2373 }
2374 
2375 static int trace__record(struct trace *trace, int argc, const char **argv)
2376 {
2377 	unsigned int rec_argc, i, j;
2378 	const char **rec_argv;
2379 	const char * const record_args[] = {
2380 		"record",
2381 		"-R",
2382 		"-m", "1024",
2383 		"-c", "1",
2384 	};
2385 
2386 	const char * const sc_args[] = { "-e", };
2387 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2388 	const char * const majpf_args[] = { "-e", "major-faults" };
2389 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2390 	const char * const minpf_args[] = { "-e", "minor-faults" };
2391 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2392 
2393 	/* +1 is for the event string below */
2394 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2395 		majpf_args_nr + minpf_args_nr + argc;
2396 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2397 
2398 	if (rec_argv == NULL)
2399 		return -ENOMEM;
2400 
2401 	j = 0;
2402 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2403 		rec_argv[j++] = record_args[i];
2404 
2405 	if (trace->trace_syscalls) {
2406 		for (i = 0; i < sc_args_nr; i++)
2407 			rec_argv[j++] = sc_args[i];
2408 
2409 		/* event string may be different for older kernels - e.g., RHEL6 */
2410 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2411 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2412 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2413 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2414 		else {
2415 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2416 			return -1;
2417 		}
2418 	}
2419 
2420 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2421 		for (i = 0; i < majpf_args_nr; i++)
2422 			rec_argv[j++] = majpf_args[i];
2423 
2424 	if (trace->trace_pgfaults & TRACE_PFMIN)
2425 		for (i = 0; i < minpf_args_nr; i++)
2426 			rec_argv[j++] = minpf_args[i];
2427 
2428 	for (i = 0; i < (unsigned int)argc; i++)
2429 		rec_argv[j++] = argv[i];
2430 
2431 	return cmd_record(j, rec_argv, NULL);
2432 }
2433 
2434 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2435 
2436 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2437 {
2438 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2439 
2440 	if (IS_ERR(evsel))
2441 		return false;
2442 
2443 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2444 		perf_evsel__delete(evsel);
2445 		return false;
2446 	}
2447 
2448 	evsel->handler = trace__vfs_getname;
2449 	perf_evlist__add(evlist, evsel);
2450 	return true;
2451 }
2452 
2453 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2454 				    u64 config)
2455 {
2456 	struct perf_evsel *evsel;
2457 	struct perf_event_attr attr = {
2458 		.type = PERF_TYPE_SOFTWARE,
2459 		.mmap_data = 1,
2460 	};
2461 
2462 	attr.config = config;
2463 	attr.sample_period = 1;
2464 
2465 	event_attr_init(&attr);
2466 
2467 	evsel = perf_evsel__new(&attr);
2468 	if (!evsel)
2469 		return -ENOMEM;
2470 
2471 	evsel->handler = trace__pgfault;
2472 	perf_evlist__add(evlist, evsel);
2473 
2474 	return 0;
2475 }
2476 
2477 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2478 {
2479 	const u32 type = event->header.type;
2480 	struct perf_evsel *evsel;
2481 
2482 	if (!trace->full_time && trace->base_time == 0)
2483 		trace->base_time = sample->time;
2484 
2485 	if (type != PERF_RECORD_SAMPLE) {
2486 		trace__process_event(trace, trace->host, event, sample);
2487 		return;
2488 	}
2489 
2490 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2491 	if (evsel == NULL) {
2492 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2493 		return;
2494 	}
2495 
2496 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2497 	    sample->raw_data == NULL) {
2498 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2499 		       perf_evsel__name(evsel), sample->tid,
2500 		       sample->cpu, sample->raw_size);
2501 	} else {
2502 		tracepoint_handler handler = evsel->handler;
2503 		handler(trace, evsel, event, sample);
2504 	}
2505 }
2506 
2507 static int trace__add_syscall_newtp(struct trace *trace)
2508 {
2509 	int ret = -1;
2510 	struct perf_evlist *evlist = trace->evlist;
2511 	struct perf_evsel *sys_enter, *sys_exit;
2512 
2513 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2514 	if (sys_enter == NULL)
2515 		goto out;
2516 
2517 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2518 		goto out_delete_sys_enter;
2519 
2520 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2521 	if (sys_exit == NULL)
2522 		goto out_delete_sys_enter;
2523 
2524 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2525 		goto out_delete_sys_exit;
2526 
2527 	perf_evlist__add(evlist, sys_enter);
2528 	perf_evlist__add(evlist, sys_exit);
2529 
2530 	trace->syscalls.events.sys_enter = sys_enter;
2531 	trace->syscalls.events.sys_exit  = sys_exit;
2532 
2533 	ret = 0;
2534 out:
2535 	return ret;
2536 
2537 out_delete_sys_exit:
2538 	perf_evsel__delete_priv(sys_exit);
2539 out_delete_sys_enter:
2540 	perf_evsel__delete_priv(sys_enter);
2541 	goto out;
2542 }
2543 
2544 static int trace__set_ev_qualifier_filter(struct trace *trace)
2545 {
2546 	int err = -1;
2547 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2548 						trace->ev_qualifier_ids.nr,
2549 						trace->ev_qualifier_ids.entries);
2550 
2551 	if (filter == NULL)
2552 		goto out_enomem;
2553 
2554 	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2555 		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2556 
2557 	free(filter);
2558 out:
2559 	return err;
2560 out_enomem:
2561 	errno = ENOMEM;
2562 	goto out;
2563 }
2564 
/*
 * Live tracing: set up the events, optionally fork the workload given in
 * @argv (when @argc > 0), then read and dispatch events from the ring buffers
 * until done/interrupted, and finally print the summary and tool stats when
 * requested.
 *
 * The evlist is deleted and trace->evlist reset to NULL on every path that
 * goes through out_delete_evlist, i.e. on all exits of this function.
 *
 * Returns 0 on success, a negative value otherwise, after printing a
 * message describing the failure on trace->output (or via pr_err()).
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	/* Event setup: syscalls, vfs_getname, page faults, sched runtime */
	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	/* vfs_getname is optional: just remember whether it is available */
	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
		goto out_error_mem;
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
		goto out_error_mem;

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		/*
		 * NOTE(review): this also prints the evlist open error text
		 * for the current errno, in addition to the message above.
		 */
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure 'evsel' is used by the error path to name the culprit */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	/* Used below to detect whether this pass consumed any event */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Stop producing new events, but keep reading what is queued */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* Nothing read: wait for more, with a timeout once 'done' is set */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error paths only reachable via goto; the ones inside the block below share
 * 'errbuf' to format their messages.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		strerror_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2780 
2781 static int trace__replay(struct trace *trace)
2782 {
2783 	const struct perf_evsel_str_handler handlers[] = {
2784 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2785 	};
2786 	struct perf_data_file file = {
2787 		.path  = input_name,
2788 		.mode  = PERF_DATA_MODE_READ,
2789 		.force = trace->force,
2790 	};
2791 	struct perf_session *session;
2792 	struct perf_evsel *evsel;
2793 	int err = -1;
2794 
2795 	trace->tool.sample	  = trace__process_sample;
2796 	trace->tool.mmap	  = perf_event__process_mmap;
2797 	trace->tool.mmap2	  = perf_event__process_mmap2;
2798 	trace->tool.comm	  = perf_event__process_comm;
2799 	trace->tool.exit	  = perf_event__process_exit;
2800 	trace->tool.fork	  = perf_event__process_fork;
2801 	trace->tool.attr	  = perf_event__process_attr;
2802 	trace->tool.tracing_data = perf_event__process_tracing_data;
2803 	trace->tool.build_id	  = perf_event__process_build_id;
2804 
2805 	trace->tool.ordered_events = true;
2806 	trace->tool.ordering_requires_timestamps = true;
2807 
2808 	/* add tid to output */
2809 	trace->multiple_threads = true;
2810 
2811 	session = perf_session__new(&file, false, &trace->tool);
2812 	if (session == NULL)
2813 		return -1;
2814 
2815 	if (symbol__init(&session->header.env) < 0)
2816 		goto out;
2817 
2818 	trace->host = &session->machines.host;
2819 
2820 	err = perf_session__set_tracepoints_handlers(session, handlers);
2821 	if (err)
2822 		goto out;
2823 
2824 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2825 						     "raw_syscalls:sys_enter");
2826 	/* older kernels have syscalls tp versus raw_syscalls */
2827 	if (evsel == NULL)
2828 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2829 							     "syscalls:sys_enter");
2830 
2831 	if (evsel &&
2832 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2833 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2834 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2835 		goto out;
2836 	}
2837 
2838 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2839 						     "raw_syscalls:sys_exit");
2840 	if (evsel == NULL)
2841 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2842 							     "syscalls:sys_exit");
2843 	if (evsel &&
2844 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2845 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2846 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2847 		goto out;
2848 	}
2849 
2850 	evlist__for_each(session->evlist, evsel) {
2851 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2852 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2853 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2854 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2855 			evsel->handler = trace__pgfault;
2856 	}
2857 
2858 	err = parse_target_str(trace);
2859 	if (err != 0)
2860 		goto out;
2861 
2862 	setup_pager();
2863 
2864 	err = perf_session__process_events(session);
2865 	if (err)
2866 		pr_err("Failed to process events, error %d", err);
2867 
2868 	else if (trace->summary)
2869 		trace__fprintf_thread_summary(trace, trace->output);
2870 
2871 out:
2872 	perf_session__delete(session);
2873 
2874 	return err;
2875 }
2876 
/*
 * Print the heading of the per-thread summary to @fp.
 *
 * Returns what fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t nr_chars = fprintf(fp, "\n Summary of events:\n\n");

	return nr_chars;
}
2885 
2886 static size_t thread__dump_stats(struct thread_trace *ttrace,
2887 				 struct trace *trace, FILE *fp)
2888 {
2889 	struct stats *stats;
2890 	size_t printed = 0;
2891 	struct syscall *sc;
2892 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2893 
2894 	if (inode == NULL)
2895 		return 0;
2896 
2897 	printed += fprintf(fp, "\n");
2898 
2899 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2900 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2901 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2902 
2903 	/* each int_node is a syscall */
2904 	while (inode) {
2905 		stats = inode->priv;
2906 		if (stats) {
2907 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2908 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2909 			double avg = avg_stats(stats);
2910 			double pct;
2911 			u64 n = (u64) stats->n;
2912 
2913 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2914 			avg /= NSEC_PER_MSEC;
2915 
2916 			sc = &trace->syscalls.table[inode->i];
2917 			printed += fprintf(fp, "   %-15s", sc->name);
2918 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2919 					   n, avg * n, min, avg);
2920 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2921 		}
2922 
2923 		inode = intlist__next(inode);
2924 	}
2925 
2926 	printed += fprintf(fp, "\n\n");
2927 
2928 	return printed;
2929 }
2930 
/*
 * struct used to pass data to per-thread function
 * (trace__fprintf_one_thread(), which receives it as its 'priv' argument).
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracer whose threads are being summarized */
	size_t printed;		/* running count of characters printed */
};
2937 
2938 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2939 {
2940 	struct summary_data *data = priv;
2941 	FILE *fp = data->fp;
2942 	size_t printed = data->printed;
2943 	struct trace *trace = data->trace;
2944 	struct thread_trace *ttrace = thread__priv(thread);
2945 	double ratio;
2946 
2947 	if (ttrace == NULL)
2948 		return 0;
2949 
2950 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2951 
2952 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2953 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2954 	printed += fprintf(fp, "%.1f%%", ratio);
2955 	if (ttrace->pfmaj)
2956 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2957 	if (ttrace->pfmin)
2958 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2959 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2960 	printed += thread__dump_stats(ttrace, trace, fp);
2961 
2962 	data->printed += printed;
2963 
2964 	return 0;
2965 }
2966 
2967 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2968 {
2969 	struct summary_data data = {
2970 		.fp = fp,
2971 		.trace = trace
2972 	};
2973 	data.printed = trace__fprintf_threads_header(fp);
2974 
2975 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2976 
2977 	return data.printed;
2978 }
2979 
2980 static int trace__set_duration(const struct option *opt, const char *str,
2981 			       int unset __maybe_unused)
2982 {
2983 	struct trace *trace = opt->value;
2984 
2985 	trace->duration_filter = atof(str);
2986 	return 0;
2987 }
2988 
2989 static int trace__set_filter_pids(const struct option *opt, const char *str,
2990 				  int unset __maybe_unused)
2991 {
2992 	int ret = -1;
2993 	size_t i;
2994 	struct trace *trace = opt->value;
2995 	/*
2996 	 * FIXME: introduce a intarray class, plain parse csv and create a
2997 	 * { int nr, int entries[] } struct...
2998 	 */
2999 	struct intlist *list = intlist__new(str);
3000 
3001 	if (list == NULL)
3002 		return -1;
3003 
3004 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3005 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3006 
3007 	if (trace->filter_pids.entries == NULL)
3008 		goto out;
3009 
3010 	trace->filter_pids.entries[0] = getpid();
3011 
3012 	for (i = 1; i < trace->filter_pids.nr; ++i)
3013 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3014 
3015 	intlist__delete(list);
3016 	ret = 0;
3017 out:
3018 	return ret;
3019 }
3020 
3021 static int trace__open_output(struct trace *trace, const char *filename)
3022 {
3023 	struct stat st;
3024 
3025 	if (!stat(filename, &st) && st.st_size) {
3026 		char oldname[PATH_MAX];
3027 
3028 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3029 		unlink(oldname);
3030 		rename(filename, oldname);
3031 	}
3032 
3033 	trace->output = fopen(filename, "w");
3034 
3035 	return trace->output == NULL ? -errno : 0;
3036 }
3037 
3038 static int parse_pagefaults(const struct option *opt, const char *str,
3039 			    int unset __maybe_unused)
3040 {
3041 	int *trace_pgfaults = opt->value;
3042 
3043 	if (strcmp(str, "all") == 0)
3044 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3045 	else if (strcmp(str, "maj") == 0)
3046 		*trace_pgfaults |= TRACE_PFMAJ;
3047 	else if (strcmp(str, "min") == 0)
3048 		*trace_pgfaults |= TRACE_PFMIN;
3049 	else
3050 		return -1;
3051 
3052 	return 0;
3053 }
3054 
3055 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3056 {
3057 	struct perf_evsel *evsel;
3058 
3059 	evlist__for_each(evlist, evsel)
3060 		evsel->handler = handler;
3061 }
3062 
3063 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3064 {
3065 	const char *trace_usage[] = {
3066 		"perf trace [<options>] [<command>]",
3067 		"perf trace [<options>] -- <command> [<options>]",
3068 		"perf trace record [<options>] [<command>]",
3069 		"perf trace record [<options>] -- <command> [<options>]",
3070 		NULL
3071 	};
3072 	struct trace trace = {
3073 		.audit = {
3074 			.machine = audit_detect_machine(),
3075 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
3076 		},
3077 		.syscalls = {
3078 			. max = -1,
3079 		},
3080 		.opts = {
3081 			.target = {
3082 				.uid	   = UINT_MAX,
3083 				.uses_mmap = true,
3084 			},
3085 			.user_freq     = UINT_MAX,
3086 			.user_interval = ULLONG_MAX,
3087 			.no_buffering  = true,
3088 			.mmap_pages    = UINT_MAX,
3089 			.proc_map_timeout  = 500,
3090 		},
3091 		.output = stderr,
3092 		.show_comm = true,
3093 		.trace_syscalls = true,
3094 	};
3095 	const char *output_name = NULL;
3096 	const char *ev_qualifier_str = NULL;
3097 	const struct option trace_options[] = {
3098 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
3099 		     "event selector. use 'perf list' to list available events",
3100 		     parse_events_option),
3101 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3102 		    "show the thread COMM next to its id"),
3103 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3104 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3105 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3106 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3107 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3108 		    "trace events on existing process id"),
3109 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3110 		    "trace events on existing thread id"),
3111 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3112 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3113 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3114 		    "system-wide collection from all CPUs"),
3115 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3116 		    "list of cpus to monitor"),
3117 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3118 		    "child tasks do not inherit counters"),
3119 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3120 		     "number of mmap data pages",
3121 		     perf_evlist__parse_mmap_pages),
3122 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3123 		   "user to profile"),
3124 	OPT_CALLBACK(0, "duration", &trace, "float",
3125 		     "show only events with duration > N.M ms",
3126 		     trace__set_duration),
3127 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3128 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3129 	OPT_BOOLEAN('T', "time", &trace.full_time,
3130 		    "Show full timestamp, not time relative to first start"),
3131 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3132 		    "Show only syscall summary with statistics"),
3133 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3134 		    "Show all syscalls and summary with statistics"),
3135 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3136 		     "Trace pagefaults", parse_pagefaults, "maj"),
3137 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3138 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3139 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3140 			"per thread proc mmap processing timeout in ms"),
3141 	OPT_END()
3142 	};
3143 	const char * const trace_subcommands[] = { "record", NULL };
3144 	int err;
3145 	char bf[BUFSIZ];
3146 
3147 	signal(SIGSEGV, sighandler_dump_stack);
3148 	signal(SIGFPE, sighandler_dump_stack);
3149 
3150 	trace.evlist = perf_evlist__new();
3151 
3152 	if (trace.evlist == NULL) {
3153 		pr_err("Not enough memory to run!\n");
3154 		err = -ENOMEM;
3155 		goto out;
3156 	}
3157 
3158 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3159 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3160 
3161 	if (trace.trace_pgfaults) {
3162 		trace.opts.sample_address = true;
3163 		trace.opts.sample_time = true;
3164 	}
3165 
3166 	if (trace.evlist->nr_entries > 0)
3167 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3168 
3169 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3170 		return trace__record(&trace, argc-1, &argv[1]);
3171 
3172 	/* summary_only implies summary option, but don't overwrite summary if set */
3173 	if (trace.summary_only)
3174 		trace.summary = trace.summary_only;
3175 
3176 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3177 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3178 		pr_err("Please specify something to trace.\n");
3179 		return -1;
3180 	}
3181 
3182 	if (output_name != NULL) {
3183 		err = trace__open_output(&trace, output_name);
3184 		if (err < 0) {
3185 			perror("failed to create output file");
3186 			goto out;
3187 		}
3188 	}
3189 
3190 	if (ev_qualifier_str != NULL) {
3191 		const char *s = ev_qualifier_str;
3192 		struct strlist_config slist_config = {
3193 			.dirname = system_path(STRACE_GROUPS_DIR),
3194 		};
3195 
3196 		trace.not_ev_qualifier = *s == '!';
3197 		if (trace.not_ev_qualifier)
3198 			++s;
3199 		trace.ev_qualifier = strlist__new(s, &slist_config);
3200 		if (trace.ev_qualifier == NULL) {
3201 			fputs("Not enough memory to parse event qualifier",
3202 			      trace.output);
3203 			err = -ENOMEM;
3204 			goto out_close;
3205 		}
3206 
3207 		err = trace__validate_ev_qualifier(&trace);
3208 		if (err)
3209 			goto out_close;
3210 	}
3211 
3212 	err = target__validate(&trace.opts.target);
3213 	if (err) {
3214 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3215 		fprintf(trace.output, "%s", bf);
3216 		goto out_close;
3217 	}
3218 
3219 	err = target__parse_uid(&trace.opts.target);
3220 	if (err) {
3221 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3222 		fprintf(trace.output, "%s", bf);
3223 		goto out_close;
3224 	}
3225 
3226 	if (!argc && target__none(&trace.opts.target))
3227 		trace.opts.target.system_wide = true;
3228 
3229 	if (input_name)
3230 		err = trace__replay(&trace);
3231 	else
3232 		err = trace__run(&trace, argc, argv);
3233 
3234 out_close:
3235 	if (output_name != NULL)
3236 		fclose(trace.output);
3237 out:
3238 	return err;
3239 }
3240