xref: /linux/tools/perf/builtin-trace.c (revision 9cfc5c90ad38c8fc11bfd39de42a107da00871ba)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include "util/exec_cmd.h"
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include "util/parse-options.h"
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 
37 #include <libaudit.h>
38 #include <stdlib.h>
39 #include <sys/mman.h>
40 #include <linux/futex.h>
41 #include <linux/err.h>
42 
/*
 * Fallback definitions for constants that older distro headers may lack.
 * Every one is guarded by #ifndef, so a definition coming from the system
 * headers always takes precedence over the value given here.
 */

/* mmap(2) flag, used by the mmap flags beautifier */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif

/* madvise(2) behaviours, used by the madvise beautifier */
#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100

#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd2(2) flags (octal), used by the eventfd flags beautifier */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

/* open(2)/pipe2(2) flag (octal) */
#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* socket(2) type and type-flags (octal), used by the socket type beautifier */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif

/* recvmsg(2) flag, used by the msg flags beautifier */
#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* perf_event_open(2) flags, used by the perf flags beautifier */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
108 
109 
/*
 * Accessor for one tracepoint field: where the field sits inside a sample's
 * raw data and which reader function extracts it. Only one of the two
 * readers is meaningful for a given field, hence the anonymous union.
 */
struct tp_field {
	int offset;	/* added to sample->raw_data by the reader functions */
	union {
		/* reads the field as an unsigned integer, returned as u64 */
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		/* returns the address of the field inside the raw data */
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
117 
118 #define TP_UINT_FIELD(bits) \
119 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
120 { \
121 	u##bits value; \
122 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
123 	return value;  \
124 }
125 
126 TP_UINT_FIELD(8);
127 TP_UINT_FIELD(16);
128 TP_UINT_FIELD(32);
129 TP_UINT_FIELD(64);
130 
131 #define TP_UINT_FIELD__SWAPPED(bits) \
132 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
133 { \
134 	u##bits value; \
135 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
136 	return bswap_##bits(value);\
137 }
138 
139 TP_UINT_FIELD__SWAPPED(16);
140 TP_UINT_FIELD__SWAPPED(32);
141 TP_UINT_FIELD__SWAPPED(64);
142 
143 static int tp_field__init_uint(struct tp_field *field,
144 			       struct format_field *format_field,
145 			       bool needs_swap)
146 {
147 	field->offset = format_field->offset;
148 
149 	switch (format_field->size) {
150 	case 1:
151 		field->integer = tp_field__u8;
152 		break;
153 	case 2:
154 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
155 		break;
156 	case 4:
157 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
158 		break;
159 	case 8:
160 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
161 		break;
162 	default:
163 		return -1;
164 	}
165 
166 	return 0;
167 }
168 
/*
 * Pointer reader for a tracepoint field: returns the address of the field
 * inside the sample's raw data (raw_data plus the field's offset). Nothing is
 * copied, so the result points into @sample.
 */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
173 
174 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
175 {
176 	field->offset = format_field->offset;
177 	field->pointer = tp_field__ptr;
178 	return 0;
179 }
180 
181 struct syscall_tp {
182 	struct tp_field id;
183 	union {
184 		struct tp_field args, ret;
185 	};
186 };
187 
188 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
189 					  struct tp_field *field,
190 					  const char *name)
191 {
192 	struct format_field *format_field = perf_evsel__field(evsel, name);
193 
194 	if (format_field == NULL)
195 		return -1;
196 
197 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
198 }
199 
200 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \
201 	({ struct syscall_tp *sc = evsel->priv;\
202 	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
203 
/*
 * Look up the tracepoint field called @name in @evsel and initialise @field
 * as a pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt != NULL ? tp_field__init_ptr(field, fmt) : -1;
}

/*
 * Same, for the member 'name' of the struct syscall_tp in evsel->priv; the
 * member name doubles as the tracepoint field name.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
219 
/*
 * Destroy an evsel whose ->priv was allocated by this file: release ->priv
 * first (zfree() is a project helper; presumably it frees and NULLs the
 * pointer), then delete the evsel itself.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
225 
226 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
227 {
228 	evsel->priv = malloc(sizeof(struct syscall_tp));
229 	if (evsel->priv != NULL) {
230 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
231 			goto out_delete;
232 
233 		evsel->handler = handler;
234 		return 0;
235 	}
236 
237 	return -ENOMEM;
238 
239 out_delete:
240 	zfree(&evsel->priv);
241 	return -ENOENT;
242 }
243 
/*
 * Create the evsel for the syscall tracepoint named by @direction and set it
 * up with @handler.
 *
 * "raw_syscalls:<direction>" is tried first, then "syscalls:<direction>".
 * Returns the new evsel, or NULL if neither tracepoint could be created or
 * the syscall_tp initialisation failed (the evsel is deleted in that case).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *tp = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(tp)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		tp = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(tp))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(tp, handler)) {
		perf_evsel__delete_priv(tp);
		return NULL;
	}

	return tp;
}
264 
/*
 * Read member 'name' of the struct syscall_tp stored in evsel->priv from
 * 'sample', using that field's integer reader. Evaluates to the reader's
 * return value.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above, but through the field's pointer reader: evaluates to the
 * pointer it returns.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
272 
/*
 * Everything a syscall_arg__scnprintf_*() beautifier receives about the
 * argument it has to format.
 */
struct syscall_arg {
	unsigned long val;	/* raw argument value */
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;	/* per-argument data, e.g. a struct strarray * */
	u8	      idx;	/* position of this argument in the syscall */
	/*
	 * Bit N refers to argument N. Beautifiers set bits here (open flags
	 * masks the mode parameter, futex op masks unused arguments);
	 * presumably the caller then skips printing those arguments.
	 */
	u8	      mask;
};
281 
/*
 * Table mapping consecutive integer values to names, used by the strarray
 * beautifiers: value V maps to entries[V - offset].
 */
struct strarray {
	int	    offset;	/* value that corresponds to entries[0] */
	int	    nr_entries;	/* number of elements in entries[] */
	const char **entries;
};
287 
/*
 * Define 'struct strarray strarray__<array>' describing the string table
 * 'array', whose first entry names the value 0.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.offset	    = 0, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}

/*
 * Same, for a table whose first entry names the value 'off' rather than 0.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries    = array, \
}
298 
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 						const char *intfmt,
301 					        struct syscall_arg *arg)
302 {
303 	struct strarray *sa = arg->parm;
304 	int idx = arg->val - sa->offset;
305 
306 	if (idx < 0 || idx >= sa->nr_entries)
307 		return scnprintf(bf, size, intfmt, arg->val);
308 
309 	return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311 
/*
 * strarray beautifier: print the name found in the table at arg->parm, or the
 * value as a decimal number ("%d") when the table has no name for it.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
319 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/*
 * Like the strarray beautifier, but a value without a name in the table is
 * printed in hexadecimal ("%#x") instead of decimal.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333 
334 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
335 					struct syscall_arg *arg);
336 
337 #define SCA_FD syscall_arg__scnprintf_fd
338 
339 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
340 					   struct syscall_arg *arg)
341 {
342 	int fd = arg->val;
343 
344 	if (fd == AT_FDCWD)
345 		return scnprintf(bf, size, "CWD");
346 
347 	return syscall_arg__scnprintf_fd(bf, size, arg);
348 }
349 
350 #define SCA_FDAT syscall_arg__scnprintf_fd_at
351 
/*
 * Forward declaration: beautifier used for the fd argument of close(); its
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
356 
/*
 * Print the full unsigned long argument value in hexadecimal with a 0x
 * prefix ("%#lx").
 */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
364 
365 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
366 					 struct syscall_arg *arg)
367 {
368 	return scnprintf(bf, size, "%d", arg->val);
369 }
370 
371 #define SCA_INT syscall_arg__scnprintf_int
372 
373 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
374 					       struct syscall_arg *arg)
375 {
376 	int printed = 0, prot = arg->val;
377 
378 	if (prot == PROT_NONE)
379 		return scnprintf(bf, size, "NONE");
380 #define	P_MMAP_PROT(n) \
381 	if (prot & PROT_##n) { \
382 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
383 		prot &= ~PROT_##n; \
384 	}
385 
386 	P_MMAP_PROT(EXEC);
387 	P_MMAP_PROT(READ);
388 	P_MMAP_PROT(WRITE);
389 #ifdef PROT_SEM
390 	P_MMAP_PROT(SEM);
391 #endif
392 	P_MMAP_PROT(GROWSDOWN);
393 	P_MMAP_PROT(GROWSUP);
394 #undef P_MMAP_PROT
395 
396 	if (prot)
397 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
398 
399 	return printed;
400 }
401 
402 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
403 
404 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
405 						struct syscall_arg *arg)
406 {
407 	int printed = 0, flags = arg->val;
408 
409 #define	P_MMAP_FLAG(n) \
410 	if (flags & MAP_##n) { \
411 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
412 		flags &= ~MAP_##n; \
413 	}
414 
415 	P_MMAP_FLAG(SHARED);
416 	P_MMAP_FLAG(PRIVATE);
417 #ifdef MAP_32BIT
418 	P_MMAP_FLAG(32BIT);
419 #endif
420 	P_MMAP_FLAG(ANONYMOUS);
421 	P_MMAP_FLAG(DENYWRITE);
422 	P_MMAP_FLAG(EXECUTABLE);
423 	P_MMAP_FLAG(FILE);
424 	P_MMAP_FLAG(FIXED);
425 	P_MMAP_FLAG(GROWSDOWN);
426 #ifdef MAP_HUGETLB
427 	P_MMAP_FLAG(HUGETLB);
428 #endif
429 	P_MMAP_FLAG(LOCKED);
430 	P_MMAP_FLAG(NONBLOCK);
431 	P_MMAP_FLAG(NORESERVE);
432 	P_MMAP_FLAG(POPULATE);
433 	P_MMAP_FLAG(STACK);
434 #ifdef MAP_UNINITIALIZED
435 	P_MMAP_FLAG(UNINITIALIZED);
436 #endif
437 #undef P_MMAP_FLAG
438 
439 	if (flags)
440 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
441 
442 	return printed;
443 }
444 
445 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
446 
447 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
448 						  struct syscall_arg *arg)
449 {
450 	int printed = 0, flags = arg->val;
451 
452 #define P_MREMAP_FLAG(n) \
453 	if (flags & MREMAP_##n) { \
454 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
455 		flags &= ~MREMAP_##n; \
456 	}
457 
458 	P_MREMAP_FLAG(MAYMOVE);
459 #ifdef MREMAP_FIXED
460 	P_MREMAP_FLAG(FIXED);
461 #endif
462 #undef P_MREMAP_FLAG
463 
464 	if (flags)
465 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
466 
467 	return printed;
468 }
469 
470 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
471 
472 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
473 						      struct syscall_arg *arg)
474 {
475 	int behavior = arg->val;
476 
477 	switch (behavior) {
478 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
479 	P_MADV_BHV(NORMAL);
480 	P_MADV_BHV(RANDOM);
481 	P_MADV_BHV(SEQUENTIAL);
482 	P_MADV_BHV(WILLNEED);
483 	P_MADV_BHV(DONTNEED);
484 	P_MADV_BHV(REMOVE);
485 	P_MADV_BHV(DONTFORK);
486 	P_MADV_BHV(DOFORK);
487 	P_MADV_BHV(HWPOISON);
488 #ifdef MADV_SOFT_OFFLINE
489 	P_MADV_BHV(SOFT_OFFLINE);
490 #endif
491 	P_MADV_BHV(MERGEABLE);
492 	P_MADV_BHV(UNMERGEABLE);
493 #ifdef MADV_HUGEPAGE
494 	P_MADV_BHV(HUGEPAGE);
495 #endif
496 #ifdef MADV_NOHUGEPAGE
497 	P_MADV_BHV(NOHUGEPAGE);
498 #endif
499 #ifdef MADV_DONTDUMP
500 	P_MADV_BHV(DONTDUMP);
501 #endif
502 #ifdef MADV_DODUMP
503 	P_MADV_BHV(DODUMP);
504 #endif
505 #undef P_MADV_PHV
506 	default: break;
507 	}
508 
509 	return scnprintf(bf, size, "%#x", behavior);
510 }
511 
512 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
513 
514 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
515 					   struct syscall_arg *arg)
516 {
517 	int printed = 0, op = arg->val;
518 
519 	if (op == 0)
520 		return scnprintf(bf, size, "NONE");
521 #define	P_CMD(cmd) \
522 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
523 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
524 		op &= ~LOCK_##cmd; \
525 	}
526 
527 	P_CMD(SH);
528 	P_CMD(EX);
529 	P_CMD(NB);
530 	P_CMD(UN);
531 	P_CMD(MAND);
532 	P_CMD(RW);
533 	P_CMD(READ);
534 	P_CMD(WRITE);
535 #undef P_OP
536 
537 	if (op)
538 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
539 
540 	return printed;
541 }
542 
543 #define SCA_FLOCK syscall_arg__scnprintf_flock
544 
545 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
546 {
547 	enum syscall_futex_args {
548 		SCF_UADDR   = (1 << 0),
549 		SCF_OP	    = (1 << 1),
550 		SCF_VAL	    = (1 << 2),
551 		SCF_TIMEOUT = (1 << 3),
552 		SCF_UADDR2  = (1 << 4),
553 		SCF_VAL3    = (1 << 5),
554 	};
555 	int op = arg->val;
556 	int cmd = op & FUTEX_CMD_MASK;
557 	size_t printed = 0;
558 
559 	switch (cmd) {
560 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
561 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
562 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
563 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
565 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
566 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
567 	P_FUTEX_OP(WAKE_OP);							  break;
568 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
569 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
571 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
572 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
573 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
574 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
575 	}
576 
577 	if (op & FUTEX_PRIVATE_FLAG)
578 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
579 
580 	if (op & FUTEX_CLOCK_REALTIME)
581 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
582 
583 	return printed;
584 }
585 
586 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
587 
/*
 * Name tables for the strarray beautifier. In each table the position of a
 * string is the numeric value it names (minus the offset, for tables defined
 * with DEFINE_STRARRAY_OFFSET), so the order of the entries is significant.
 */

/* bpf(): 'cmd' argument */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(): 'op' argument; the first entry names the value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* getitimer(): 'which' argument */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl operations */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* seek 'whence' values; DATA and HOLE only when the headers define them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(): 'cmd' argument */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* getrlimit(): 'resource' argument */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* signal mask 'how' values */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(): 'clk_id' argument */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);
643 
/*
 * Socket address family names, indexed by the AF_* number.
 *
 * The position of every string must equal the numeric value of its AF_*
 * constant. "MPLS" (AF_MPLS, 28) sits between "IB" (27) and "CAN" (29);
 * without it "CAN" and every family after it would be reported under the
 * wrong number.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
652 static DEFINE_STRARRAY(socket_families);
653 
654 #ifndef SOCK_TYPE_MASK
655 #define SOCK_TYPE_MASK 0xf
656 #endif
657 
658 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
659 						      struct syscall_arg *arg)
660 {
661 	size_t printed;
662 	int type = arg->val,
663 	    flags = type & ~SOCK_TYPE_MASK;
664 
665 	type &= SOCK_TYPE_MASK;
666 	/*
667  	 * Can't use a strarray, MIPS may override for ABI reasons.
668  	 */
669 	switch (type) {
670 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
671 	P_SK_TYPE(STREAM);
672 	P_SK_TYPE(DGRAM);
673 	P_SK_TYPE(RAW);
674 	P_SK_TYPE(RDM);
675 	P_SK_TYPE(SEQPACKET);
676 	P_SK_TYPE(DCCP);
677 	P_SK_TYPE(PACKET);
678 #undef P_SK_TYPE
679 	default:
680 		printed = scnprintf(bf, size, "%#x", type);
681 	}
682 
683 #define	P_SK_FLAG(n) \
684 	if (flags & SOCK_##n) { \
685 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
686 		flags &= ~SOCK_##n; \
687 	}
688 
689 	P_SK_FLAG(CLOEXEC);
690 	P_SK_FLAG(NONBLOCK);
691 #undef P_SK_FLAG
692 
693 	if (flags)
694 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
695 
696 	return printed;
697 }
698 
699 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
700 
701 #ifndef MSG_PROBE
702 #define MSG_PROBE	     0x10
703 #endif
704 #ifndef MSG_WAITFORONE
705 #define MSG_WAITFORONE	0x10000
706 #endif
707 #ifndef MSG_SENDPAGE_NOTLAST
708 #define MSG_SENDPAGE_NOTLAST 0x20000
709 #endif
710 #ifndef MSG_FASTOPEN
711 #define MSG_FASTOPEN	     0x20000000
712 #endif
713 
714 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
715 					       struct syscall_arg *arg)
716 {
717 	int printed = 0, flags = arg->val;
718 
719 	if (flags == 0)
720 		return scnprintf(bf, size, "NONE");
721 #define	P_MSG_FLAG(n) \
722 	if (flags & MSG_##n) { \
723 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
724 		flags &= ~MSG_##n; \
725 	}
726 
727 	P_MSG_FLAG(OOB);
728 	P_MSG_FLAG(PEEK);
729 	P_MSG_FLAG(DONTROUTE);
730 	P_MSG_FLAG(TRYHARD);
731 	P_MSG_FLAG(CTRUNC);
732 	P_MSG_FLAG(PROBE);
733 	P_MSG_FLAG(TRUNC);
734 	P_MSG_FLAG(DONTWAIT);
735 	P_MSG_FLAG(EOR);
736 	P_MSG_FLAG(WAITALL);
737 	P_MSG_FLAG(FIN);
738 	P_MSG_FLAG(SYN);
739 	P_MSG_FLAG(CONFIRM);
740 	P_MSG_FLAG(RST);
741 	P_MSG_FLAG(ERRQUEUE);
742 	P_MSG_FLAG(NOSIGNAL);
743 	P_MSG_FLAG(MORE);
744 	P_MSG_FLAG(WAITFORONE);
745 	P_MSG_FLAG(SENDPAGE_NOTLAST);
746 	P_MSG_FLAG(FASTOPEN);
747 	P_MSG_FLAG(CMSG_CLOEXEC);
748 #undef P_MSG_FLAG
749 
750 	if (flags)
751 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
752 
753 	return printed;
754 }
755 
756 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
757 
758 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
759 						 struct syscall_arg *arg)
760 {
761 	size_t printed = 0;
762 	int mode = arg->val;
763 
764 	if (mode == F_OK) /* 0 */
765 		return scnprintf(bf, size, "F");
766 #define	P_MODE(n) \
767 	if (mode & n##_OK) { \
768 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
769 		mode &= ~n##_OK; \
770 	}
771 
772 	P_MODE(R);
773 	P_MODE(W);
774 	P_MODE(X);
775 #undef P_MODE
776 
777 	if (mode)
778 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
779 
780 	return printed;
781 }
782 
783 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
784 
/*
 * Forward declaration: beautifier used for filename/pathname arguments; its
 * definition is not in this part of the file.
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
789 
790 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
791 					       struct syscall_arg *arg)
792 {
793 	int printed = 0, flags = arg->val;
794 
795 	if (!(flags & O_CREAT))
796 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
797 
798 	if (flags == 0)
799 		return scnprintf(bf, size, "RDONLY");
800 #define	P_FLAG(n) \
801 	if (flags & O_##n) { \
802 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
803 		flags &= ~O_##n; \
804 	}
805 
806 	P_FLAG(APPEND);
807 	P_FLAG(ASYNC);
808 	P_FLAG(CLOEXEC);
809 	P_FLAG(CREAT);
810 	P_FLAG(DIRECT);
811 	P_FLAG(DIRECTORY);
812 	P_FLAG(EXCL);
813 	P_FLAG(LARGEFILE);
814 	P_FLAG(NOATIME);
815 	P_FLAG(NOCTTY);
816 #ifdef O_NONBLOCK
817 	P_FLAG(NONBLOCK);
818 #elif O_NDELAY
819 	P_FLAG(NDELAY);
820 #endif
821 #ifdef O_PATH
822 	P_FLAG(PATH);
823 #endif
824 	P_FLAG(RDWR);
825 #ifdef O_DSYNC
826 	if ((flags & O_SYNC) == O_SYNC)
827 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
828 	else {
829 		P_FLAG(DSYNC);
830 	}
831 #else
832 	P_FLAG(SYNC);
833 #endif
834 	P_FLAG(TRUNC);
835 	P_FLAG(WRONLY);
836 #undef P_FLAG
837 
838 	if (flags)
839 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
840 
841 	return printed;
842 }
843 
844 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
845 
846 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
847 						struct syscall_arg *arg)
848 {
849 	int printed = 0, flags = arg->val;
850 
851 	if (flags == 0)
852 		return 0;
853 
854 #define	P_FLAG(n) \
855 	if (flags & PERF_FLAG_##n) { \
856 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
857 		flags &= ~PERF_FLAG_##n; \
858 	}
859 
860 	P_FLAG(FD_NO_GROUP);
861 	P_FLAG(FD_OUTPUT);
862 	P_FLAG(PID_CGROUP);
863 	P_FLAG(FD_CLOEXEC);
864 #undef P_FLAG
865 
866 	if (flags)
867 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
868 
869 	return printed;
870 }
871 
872 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
873 
874 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
875 						   struct syscall_arg *arg)
876 {
877 	int printed = 0, flags = arg->val;
878 
879 	if (flags == 0)
880 		return scnprintf(bf, size, "NONE");
881 #define	P_FLAG(n) \
882 	if (flags & EFD_##n) { \
883 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
884 		flags &= ~EFD_##n; \
885 	}
886 
887 	P_FLAG(SEMAPHORE);
888 	P_FLAG(CLOEXEC);
889 	P_FLAG(NONBLOCK);
890 #undef P_FLAG
891 
892 	if (flags)
893 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
894 
895 	return printed;
896 }
897 
898 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
899 
900 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
901 						struct syscall_arg *arg)
902 {
903 	int printed = 0, flags = arg->val;
904 
905 #define	P_FLAG(n) \
906 	if (flags & O_##n) { \
907 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
908 		flags &= ~O_##n; \
909 	}
910 
911 	P_FLAG(CLOEXEC);
912 	P_FLAG(NONBLOCK);
913 #undef P_FLAG
914 
915 	if (flags)
916 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
917 
918 	return printed;
919 }
920 
921 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
922 
923 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
924 {
925 	int sig = arg->val;
926 
927 	switch (sig) {
928 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
929 	P_SIGNUM(HUP);
930 	P_SIGNUM(INT);
931 	P_SIGNUM(QUIT);
932 	P_SIGNUM(ILL);
933 	P_SIGNUM(TRAP);
934 	P_SIGNUM(ABRT);
935 	P_SIGNUM(BUS);
936 	P_SIGNUM(FPE);
937 	P_SIGNUM(KILL);
938 	P_SIGNUM(USR1);
939 	P_SIGNUM(SEGV);
940 	P_SIGNUM(USR2);
941 	P_SIGNUM(PIPE);
942 	P_SIGNUM(ALRM);
943 	P_SIGNUM(TERM);
944 	P_SIGNUM(CHLD);
945 	P_SIGNUM(CONT);
946 	P_SIGNUM(STOP);
947 	P_SIGNUM(TSTP);
948 	P_SIGNUM(TTIN);
949 	P_SIGNUM(TTOU);
950 	P_SIGNUM(URG);
951 	P_SIGNUM(XCPU);
952 	P_SIGNUM(XFSZ);
953 	P_SIGNUM(VTALRM);
954 	P_SIGNUM(PROF);
955 	P_SIGNUM(WINCH);
956 	P_SIGNUM(IO);
957 	P_SIGNUM(PWR);
958 	P_SIGNUM(SYS);
959 #ifdef SIGEMT
960 	P_SIGNUM(EMT);
961 #endif
962 #ifdef SIGSTKFLT
963 	P_SIGNUM(STKFLT);
964 #endif
965 #ifdef SIGSWI
966 	P_SIGNUM(SWI);
967 #endif
968 	default: break;
969 	}
970 
971 	return scnprintf(bf, size, "%#x", sig);
972 }
973 
974 #define SCA_SIGNUM syscall_arg__scnprintf_signum
975 
976 #if defined(__i386__) || defined(__x86_64__)
977 /*
978  * FIXME: Make this available to all arches.
979  */
980 #define TCGETS		0x5401
981 
982 static const char *tioctls[] = {
983 	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
984 	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
985 	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
986 	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
987 	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
988 	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
989 	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
990 	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
991 	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
992 	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
993 	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
994 	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
995 	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
996 	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
997 	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
998 };
999 
1000 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1001 #endif /* defined(__i386__) || defined(__x86_64__) */
1002 
/*
 * Initializer fragment for a syscall_fmts[] entry: format argument number
 * 'arg' with SCA_STRARRAY, using the table strarray__<array>. 'name' is not
 * used in the expansion; callers pass the argument's name there, which
 * documents the entry.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
1006 
1007 static struct syscall_fmt {
1008 	const char *name;
1009 	const char *alias;
1010 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1011 	void	   *arg_parm[6];
1012 	bool	   errmsg;
1013 	bool	   timeout;
1014 	bool	   hexret;
1015 } syscall_fmts[] = {
1016 	{ .name	    = "access",	    .errmsg = true,
1017 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1018 			     [1] = SCA_ACCMODE,  /* mode */ }, },
1019 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1020 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1021 	{ .name	    = "brk",	    .hexret = true,
1022 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1023 	{ .name	    = "chdir",	    .errmsg = true,
1024 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1025 	{ .name	    = "chmod",	    .errmsg = true,
1026 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1027 	{ .name	    = "chroot",	    .errmsg = true,
1028 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1029 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1030 	{ .name	    = "close",	    .errmsg = true,
1031 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1032 	{ .name	    = "connect",    .errmsg = true, },
1033 	{ .name	    = "creat",	    .errmsg = true,
1034 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1035 	{ .name	    = "dup",	    .errmsg = true,
1036 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1037 	{ .name	    = "dup2",	    .errmsg = true,
1038 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1039 	{ .name	    = "dup3",	    .errmsg = true,
1040 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1041 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1042 	{ .name	    = "eventfd2",   .errmsg = true,
1043 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1044 	{ .name	    = "faccessat",  .errmsg = true,
1045 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1046 			     [1] = SCA_FILENAME, /* filename */ }, },
1047 	{ .name	    = "fadvise64",  .errmsg = true,
1048 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1049 	{ .name	    = "fallocate",  .errmsg = true,
1050 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1051 	{ .name	    = "fchdir",	    .errmsg = true,
1052 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1053 	{ .name	    = "fchmod",	    .errmsg = true,
1054 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1055 	{ .name	    = "fchmodat",   .errmsg = true,
1056 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1057 			     [1] = SCA_FILENAME, /* filename */ }, },
1058 	{ .name	    = "fchown",	    .errmsg = true,
1059 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1060 	{ .name	    = "fchownat",   .errmsg = true,
1061 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1062 			     [1] = SCA_FILENAME, /* filename */ }, },
1063 	{ .name	    = "fcntl",	    .errmsg = true,
1064 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1065 			     [1] = SCA_STRARRAY, /* cmd */ },
1066 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1067 	{ .name	    = "fdatasync",  .errmsg = true,
1068 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069 	{ .name	    = "flock",	    .errmsg = true,
1070 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1071 			     [1] = SCA_FLOCK, /* cmd */ }, },
1072 	{ .name	    = "fsetxattr",  .errmsg = true,
1073 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1074 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1075 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1076 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1077 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1078 			     [1] = SCA_FILENAME, /* filename */ }, },
1079 	{ .name	    = "fstatfs",    .errmsg = true,
1080 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1081 	{ .name	    = "fsync",    .errmsg = true,
1082 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1083 	{ .name	    = "ftruncate", .errmsg = true,
1084 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1085 	{ .name	    = "futex",	    .errmsg = true,
1086 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1087 	{ .name	    = "futimesat", .errmsg = true,
1088 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1089 			     [1] = SCA_FILENAME, /* filename */ }, },
1090 	{ .name	    = "getdents",   .errmsg = true,
1091 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1092 	{ .name	    = "getdents64", .errmsg = true,
1093 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1094 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1095 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1096 	{ .name	    = "getxattr",    .errmsg = true,
1097 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1098 	{ .name	    = "inotify_add_watch",	    .errmsg = true,
1099 	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1100 	{ .name	    = "ioctl",	    .errmsg = true,
1101 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1102 #if defined(__i386__) || defined(__x86_64__)
1103 /*
1104  * FIXME: Make this available to all arches.
1105  */
1106 			     [1] = SCA_STRHEXARRAY, /* cmd */
1107 			     [2] = SCA_HEX, /* arg */ },
1108 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1109 #else
1110 			     [2] = SCA_HEX, /* arg */ }, },
1111 #endif
1112 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
1113 	{ .name	    = "kill",	    .errmsg = true,
1114 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1115 	{ .name	    = "lchown",    .errmsg = true,
1116 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1117 	{ .name	    = "lgetxattr",  .errmsg = true,
1118 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1119 	{ .name	    = "linkat",	    .errmsg = true,
1120 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1121 	{ .name	    = "listxattr",  .errmsg = true,
1122 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1123 	{ .name	    = "llistxattr", .errmsg = true,
1124 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1125 	{ .name	    = "lremovexattr",  .errmsg = true,
1126 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1127 	{ .name	    = "lseek",	    .errmsg = true,
1128 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1129 			     [2] = SCA_STRARRAY, /* whence */ },
1130 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1131 	{ .name	    = "lsetxattr",  .errmsg = true,
1132 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1133 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat",
1134 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1135 	{ .name	    = "lsxattr",    .errmsg = true,
1136 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1137 	{ .name     = "madvise",    .errmsg = true,
1138 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1139 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1140 	{ .name	    = "mkdir",    .errmsg = true,
1141 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1142 	{ .name	    = "mkdirat",    .errmsg = true,
1143 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1144 			     [1] = SCA_FILENAME, /* pathname */ }, },
1145 	{ .name	    = "mknod",      .errmsg = true,
1146 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1147 	{ .name	    = "mknodat",    .errmsg = true,
1148 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1149 			     [1] = SCA_FILENAME, /* filename */ }, },
1150 	{ .name	    = "mlock",	    .errmsg = true,
1151 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1152 	{ .name	    = "mlockall",   .errmsg = true,
1153 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1154 	{ .name	    = "mmap",	    .hexret = true,
1155 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1156 			     [2] = SCA_MMAP_PROT, /* prot */
1157 			     [3] = SCA_MMAP_FLAGS, /* flags */
1158 			     [4] = SCA_FD, 	  /* fd */ }, },
1159 	{ .name	    = "mprotect",   .errmsg = true,
1160 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1161 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1162 	{ .name	    = "mq_unlink", .errmsg = true,
1163 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1164 	{ .name	    = "mremap",	    .hexret = true,
1165 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1166 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1167 			     [4] = SCA_HEX, /* new_addr */ }, },
1168 	{ .name	    = "munlock",    .errmsg = true,
1169 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1170 	{ .name	    = "munmap",	    .errmsg = true,
1171 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1172 	{ .name	    = "name_to_handle_at", .errmsg = true,
1173 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1174 	{ .name	    = "newfstatat", .errmsg = true,
1175 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1176 			     [1] = SCA_FILENAME, /* filename */ }, },
1177 	{ .name	    = "open",	    .errmsg = true,
1178 	  .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1179 			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1180 	{ .name	    = "open_by_handle_at", .errmsg = true,
1181 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1182 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1183 	{ .name	    = "openat",	    .errmsg = true,
1184 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1185 			     [1] = SCA_FILENAME, /* filename */
1186 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1187 	{ .name	    = "perf_event_open", .errmsg = true,
1188 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1189 			     [2] = SCA_INT, /* cpu */
1190 			     [3] = SCA_FD,  /* group_fd */
1191 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1192 	{ .name	    = "pipe2",	    .errmsg = true,
1193 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1194 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1195 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1196 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1197 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1198 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1199 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1200 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1201 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1202 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1203 	{ .name	    = "pwritev",    .errmsg = true,
1204 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1205 	{ .name	    = "read",	    .errmsg = true,
1206 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1207 	{ .name	    = "readlink",   .errmsg = true,
1208 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1209 	{ .name	    = "readlinkat", .errmsg = true,
1210 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1211 			     [1] = SCA_FILENAME, /* pathname */ }, },
1212 	{ .name	    = "readv",	    .errmsg = true,
1213 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1214 	{ .name	    = "recvfrom",   .errmsg = true,
1215 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1216 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1217 	{ .name	    = "recvmmsg",   .errmsg = true,
1218 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1219 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1220 	{ .name	    = "recvmsg",    .errmsg = true,
1221 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1222 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1223 	{ .name	    = "removexattr", .errmsg = true,
1224 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1225 	{ .name	    = "renameat",   .errmsg = true,
1226 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1227 	{ .name	    = "rmdir",    .errmsg = true,
1228 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1229 	{ .name	    = "rt_sigaction", .errmsg = true,
1230 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1231 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1232 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1233 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1234 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1235 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1236 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1237 	{ .name	    = "sendmmsg",    .errmsg = true,
1238 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1239 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1240 	{ .name	    = "sendmsg",    .errmsg = true,
1241 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1242 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1243 	{ .name	    = "sendto",	    .errmsg = true,
1244 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1245 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1246 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1247 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1248 	{ .name	    = "setxattr",   .errmsg = true,
1249 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1250 	{ .name	    = "shutdown",   .errmsg = true,
1251 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1252 	{ .name	    = "socket",	    .errmsg = true,
1253 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1254 			     [1] = SCA_SK_TYPE, /* type */ },
1255 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1256 	{ .name	    = "socketpair", .errmsg = true,
1257 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1258 			     [1] = SCA_SK_TYPE, /* type */ },
1259 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1260 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat",
1261 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1262 	{ .name	    = "statfs",	    .errmsg = true,
1263 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1264 	{ .name	    = "swapoff",    .errmsg = true,
1265 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1266 	{ .name	    = "swapon",	    .errmsg = true,
1267 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1268 	{ .name	    = "symlinkat",  .errmsg = true,
1269 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1270 	{ .name	    = "tgkill",	    .errmsg = true,
1271 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1272 	{ .name	    = "tkill",	    .errmsg = true,
1273 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1274 	{ .name	    = "truncate",   .errmsg = true,
1275 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1276 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1277 	{ .name	    = "unlinkat",   .errmsg = true,
1278 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1279 			     [1] = SCA_FILENAME, /* pathname */ }, },
1280 	{ .name	    = "utime",  .errmsg = true,
1281 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1282 	{ .name	    = "utimensat",  .errmsg = true,
1283 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1284 			     [1] = SCA_FILENAME, /* filename */ }, },
1285 	{ .name	    = "utimes",  .errmsg = true,
1286 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1287 	{ .name	    = "vmsplice",  .errmsg = true,
1288 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1289 	{ .name	    = "write",	    .errmsg = true,
1290 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1291 	{ .name	    = "writev",	    .errmsg = true,
1292 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1293 };
1294 
1295 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1296 {
1297 	const struct syscall_fmt *fmt = fmtp;
1298 	return strcmp(name, fmt->name);
1299 }
1300 
1301 static struct syscall_fmt *syscall_fmt__find(const char *name)
1302 {
1303 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1304 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1305 }
1306 
/*
 * Per syscall id state, one slot of trace->syscalls.table, filled in by
 * trace__read_syscall_info().
 */
struct syscall {
	/* format of the syscalls:sys_enter_<name> tracepoint, may be an ERR_PTR */
	struct event_format *tp_format;
	/* number of argument fields, not counting a leading "nr" field */
	int nr_args;
	/* first argument field of tp_format, NULL when there is no usable format */
	struct format_field *args;
	/* syscall name as returned by audit_syscall_to_name() */
	const char *name;
	/* true for "exit" and "exit_group" */
	bool is_exit;
	/* matching syscall_fmts entry, NULL when the syscall has none */
	struct syscall_fmt *fmt;
	/* per argument formatters, indexed by argument position */
	size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* per argument formatter parameters, taken from fmt->arg_parm */
	void **arg_parm;
};
1317 
1318 static size_t fprintf_duration(unsigned long t, FILE *fp)
1319 {
1320 	double duration = (double)t / NSEC_PER_MSEC;
1321 	size_t printed = fprintf(fp, "(");
1322 
1323 	if (duration >= 1.0)
1324 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1325 	else if (duration >= 0.01)
1326 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1327 	else
1328 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1329 	return printed + fprintf(fp, "): ");
1330 }
1331 
1332 /**
1333  * filename.ptr: The filename char pointer that will be vfs_getname'd
1334  * filename.entry_str_pos: Where to insert the string translated from
1335  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
1336  */
1337 struct thread_trace {
1338 	u64		  entry_time;
1339 	u64		  exit_time;
1340 	bool		  entry_pending;
1341 	unsigned long	  nr_events;
1342 	unsigned long	  pfmaj, pfmin;
1343 	char		  *entry_str;
1344 	double		  runtime_ms;
1345         struct {
1346 		unsigned long ptr;
1347 		short int     entry_str_pos;
1348 		bool	      pending_open;
1349 		unsigned int  namelen;
1350 		char	      *name;
1351 	} filename;
1352 	struct {
1353 		int	  max;
1354 		char	  **table;
1355 	} paths;
1356 
1357 	struct intlist *syscall_stats;
1358 };
1359 
1360 static struct thread_trace *thread_trace__new(void)
1361 {
1362 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1363 
1364 	if (ttrace)
1365 		ttrace->paths.max = -1;
1366 
1367 	ttrace->syscall_stats = intlist__new(NULL);
1368 
1369 	return ttrace;
1370 }
1371 
1372 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1373 {
1374 	struct thread_trace *ttrace;
1375 
1376 	if (thread == NULL)
1377 		goto fail;
1378 
1379 	if (thread__priv(thread) == NULL)
1380 		thread__set_priv(thread, thread_trace__new());
1381 
1382 	if (thread__priv(thread) == NULL)
1383 		goto fail;
1384 
1385 	ttrace = thread__priv(thread);
1386 	++ttrace->nr_events;
1387 
1388 	return ttrace;
1389 fail:
1390 	color_fprintf(fp, PERF_COLOR_RED,
1391 		      "WARNING: not enough memory, dropping samples!\n");
1392 	return NULL;
1393 }
1394 
/*
 * Single bit flags.
 * NOTE(review): presumably OR-ed into trace->trace_pgfaults to select major
 * and/or minor page fault tracing - confirm against the users of that field.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size, in bytes, of the thread_trace->entry_str buffer (see trace__sys_enter) */
static const size_t trace__entry_str_size = 2048;
1399 
1400 struct trace {
1401 	struct perf_tool	tool;
1402 	struct {
1403 		int		machine;
1404 		int		open_id;
1405 	}			audit;
1406 	struct {
1407 		int		max;
1408 		struct syscall  *table;
1409 		struct {
1410 			struct perf_evsel *sys_enter,
1411 					  *sys_exit;
1412 		}		events;
1413 	} syscalls;
1414 	struct record_opts	opts;
1415 	struct perf_evlist	*evlist;
1416 	struct machine		*host;
1417 	struct thread		*current;
1418 	u64			base_time;
1419 	FILE			*output;
1420 	unsigned long		nr_events;
1421 	struct strlist		*ev_qualifier;
1422 	struct {
1423 		size_t		nr;
1424 		int		*entries;
1425 	}			ev_qualifier_ids;
1426 	struct intlist		*tid_list;
1427 	struct intlist		*pid_list;
1428 	struct {
1429 		size_t		nr;
1430 		pid_t		*entries;
1431 	}			filter_pids;
1432 	double			duration_filter;
1433 	double			runtime_ms;
1434 	struct {
1435 		u64		vfs_getname,
1436 				proc_getname;
1437 	} stats;
1438 	bool			not_ev_qualifier;
1439 	bool			live;
1440 	bool			full_time;
1441 	bool			sched;
1442 	bool			multiple_threads;
1443 	bool			summary;
1444 	bool			summary_only;
1445 	bool			show_comm;
1446 	bool			show_tool_stats;
1447 	bool			trace_syscalls;
1448 	bool			force;
1449 	bool			vfs_getname;
1450 	int			trace_pgfaults;
1451 };
1452 
1453 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1454 {
1455 	struct thread_trace *ttrace = thread__priv(thread);
1456 
1457 	if (fd > ttrace->paths.max) {
1458 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1459 
1460 		if (npath == NULL)
1461 			return -1;
1462 
1463 		if (ttrace->paths.max != -1) {
1464 			memset(npath + ttrace->paths.max + 1, 0,
1465 			       (fd - ttrace->paths.max) * sizeof(char *));
1466 		} else {
1467 			memset(npath, 0, (fd + 1) * sizeof(char *));
1468 		}
1469 
1470 		ttrace->paths.table = npath;
1471 		ttrace->paths.max   = fd;
1472 	}
1473 
1474 	ttrace->paths.table[fd] = strdup(pathname);
1475 
1476 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1477 }
1478 
1479 static int thread__read_fd_path(struct thread *thread, int fd)
1480 {
1481 	char linkname[PATH_MAX], pathname[PATH_MAX];
1482 	struct stat st;
1483 	int ret;
1484 
1485 	if (thread->pid_ == thread->tid) {
1486 		scnprintf(linkname, sizeof(linkname),
1487 			  "/proc/%d/fd/%d", thread->pid_, fd);
1488 	} else {
1489 		scnprintf(linkname, sizeof(linkname),
1490 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1491 	}
1492 
1493 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1494 		return -1;
1495 
1496 	ret = readlink(linkname, pathname, sizeof(pathname));
1497 
1498 	if (ret < 0 || ret > st.st_size)
1499 		return -1;
1500 
1501 	pathname[ret] = '\0';
1502 	return trace__set_fd_pathname(thread, fd, pathname);
1503 }
1504 
1505 static const char *thread__fd_path(struct thread *thread, int fd,
1506 				   struct trace *trace)
1507 {
1508 	struct thread_trace *ttrace = thread__priv(thread);
1509 
1510 	if (ttrace == NULL)
1511 		return NULL;
1512 
1513 	if (fd < 0)
1514 		return NULL;
1515 
1516 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1517 		if (!trace->live)
1518 			return NULL;
1519 		++trace->stats.proc_getname;
1520 		if (thread__read_fd_path(thread, fd))
1521 			return NULL;
1522 	}
1523 
1524 	return ttrace->paths.table[fd];
1525 }
1526 
1527 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1528 					struct syscall_arg *arg)
1529 {
1530 	int fd = arg->val;
1531 	size_t printed = scnprintf(bf, size, "%d", fd);
1532 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1533 
1534 	if (path)
1535 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1536 
1537 	return printed;
1538 }
1539 
1540 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1541 					      struct syscall_arg *arg)
1542 {
1543 	int fd = arg->val;
1544 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1545 	struct thread_trace *ttrace = thread__priv(arg->thread);
1546 
1547 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1548 		zfree(&ttrace->paths.table[fd]);
1549 
1550 	return printed;
1551 }
1552 
1553 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1554 				     unsigned long ptr)
1555 {
1556 	struct thread_trace *ttrace = thread__priv(thread);
1557 
1558 	ttrace->filename.ptr = ptr;
1559 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1560 }
1561 
1562 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1563 					      struct syscall_arg *arg)
1564 {
1565 	unsigned long ptr = arg->val;
1566 
1567 	if (!arg->trace->vfs_getname)
1568 		return scnprintf(bf, size, "%#x", ptr);
1569 
1570 	thread__set_filename_pos(arg->thread, bf, ptr);
1571 	return 0;
1572 }
1573 
1574 static bool trace__filter_duration(struct trace *trace, double t)
1575 {
1576 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1577 }
1578 
1579 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1580 {
1581 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1582 
1583 	return fprintf(fp, "%10.3f ", ts);
1584 }
1585 
/* Set from sig_handler(): a signal arrived / that signal was SIGINT */
static bool done = false;
static bool interrupted = false;

/*
 * Signal handler: flag that a signal was received and whether it was SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;

	if (sig == SIGINT)
		interrupted = true;
	else
		interrupted = false;
}
1594 
1595 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1596 					u64 duration, u64 tstamp, FILE *fp)
1597 {
1598 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1599 	printed += fprintf_duration(duration, fp);
1600 
1601 	if (trace->multiple_threads) {
1602 		if (trace->show_comm)
1603 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1604 		printed += fprintf(fp, "%d ", thread->tid);
1605 	}
1606 
1607 	return printed;
1608 }
1609 
1610 static int trace__process_event(struct trace *trace, struct machine *machine,
1611 				union perf_event *event, struct perf_sample *sample)
1612 {
1613 	int ret = 0;
1614 
1615 	switch (event->header.type) {
1616 	case PERF_RECORD_LOST:
1617 		color_fprintf(trace->output, PERF_COLOR_RED,
1618 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1619 		ret = machine__process_lost_event(machine, event, sample);
1620 	default:
1621 		ret = machine__process_event(machine, event, sample);
1622 		break;
1623 	}
1624 
1625 	return ret;
1626 }
1627 
1628 static int trace__tool_process(struct perf_tool *tool,
1629 			       union perf_event *event,
1630 			       struct perf_sample *sample,
1631 			       struct machine *machine)
1632 {
1633 	struct trace *trace = container_of(tool, struct trace, tool);
1634 	return trace__process_event(trace, machine, event, sample);
1635 }
1636 
1637 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1638 {
1639 	int err = symbol__init(NULL);
1640 
1641 	if (err)
1642 		return err;
1643 
1644 	trace->host = machine__new_host();
1645 	if (trace->host == NULL)
1646 		return -ENOMEM;
1647 
1648 	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1649 		return -errno;
1650 
1651 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1652 					    evlist->threads, trace__tool_process, false,
1653 					    trace->opts.proc_map_timeout);
1654 	if (err)
1655 		symbol__exit();
1656 
1657 	return err;
1658 }
1659 
1660 static int syscall__set_arg_fmts(struct syscall *sc)
1661 {
1662 	struct format_field *field;
1663 	int idx = 0;
1664 
1665 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1666 	if (sc->arg_scnprintf == NULL)
1667 		return -1;
1668 
1669 	if (sc->fmt)
1670 		sc->arg_parm = sc->fmt->arg_parm;
1671 
1672 	for (field = sc->args; field; field = field->next) {
1673 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1674 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1675 		else if (field->flags & FIELD_IS_POINTER)
1676 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1677 		++idx;
1678 	}
1679 
1680 	return 0;
1681 }
1682 
1683 static int trace__read_syscall_info(struct trace *trace, int id)
1684 {
1685 	char tp_name[128];
1686 	struct syscall *sc;
1687 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1688 
1689 	if (name == NULL)
1690 		return -1;
1691 
1692 	if (id > trace->syscalls.max) {
1693 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1694 
1695 		if (nsyscalls == NULL)
1696 			return -1;
1697 
1698 		if (trace->syscalls.max != -1) {
1699 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1700 			       (id - trace->syscalls.max) * sizeof(*sc));
1701 		} else {
1702 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1703 		}
1704 
1705 		trace->syscalls.table = nsyscalls;
1706 		trace->syscalls.max   = id;
1707 	}
1708 
1709 	sc = trace->syscalls.table + id;
1710 	sc->name = name;
1711 
1712 	sc->fmt  = syscall_fmt__find(sc->name);
1713 
1714 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1715 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1716 
1717 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1718 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1719 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1720 	}
1721 
1722 	if (IS_ERR(sc->tp_format))
1723 		return -1;
1724 
1725 	sc->args = sc->tp_format->format.fields;
1726 	sc->nr_args = sc->tp_format->format.nr_fields;
1727 	/* drop nr field - not relevant here; does not exist on older kernels */
1728 	if (sc->args && strcmp(sc->args->name, "nr") == 0) {
1729 		sc->args = sc->args->next;
1730 		--sc->nr_args;
1731 	}
1732 
1733 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1734 
1735 	return syscall__set_arg_fmts(sc);
1736 }
1737 
1738 static int trace__validate_ev_qualifier(struct trace *trace)
1739 {
1740 	int err = 0, i;
1741 	struct str_node *pos;
1742 
1743 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1744 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1745 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1746 
1747 	if (trace->ev_qualifier_ids.entries == NULL) {
1748 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1749 		       trace->output);
1750 		err = -EINVAL;
1751 		goto out;
1752 	}
1753 
1754 	i = 0;
1755 
1756 	strlist__for_each(pos, trace->ev_qualifier) {
1757 		const char *sc = pos->s;
1758 		int id = audit_name_to_syscall(sc, trace->audit.machine);
1759 
1760 		if (id < 0) {
1761 			if (err == 0) {
1762 				fputs("Error:\tInvalid syscall ", trace->output);
1763 				err = -EINVAL;
1764 			} else {
1765 				fputs(", ", trace->output);
1766 			}
1767 
1768 			fputs(sc, trace->output);
1769 		}
1770 
1771 		trace->ev_qualifier_ids.entries[i++] = id;
1772 	}
1773 
1774 	if (err < 0) {
1775 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1776 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1777 		zfree(&trace->ev_qualifier_ids.entries);
1778 		trace->ev_qualifier_ids.nr = 0;
1779 	}
1780 out:
1781 	return err;
1782 }
1783 
1784 /*
1785  * args is to be interpreted as a series of longs but we need to handle
1786  * 8-byte unaligned accesses. args points to raw_data within the event
1787  * and raw_data is guaranteed to be 8-byte unaligned because it is
1788  * preceded by raw_size which is a u32. So we need to copy args to a temp
1789  * variable to read it. Most notably this avoids extended load instructions
1790  * on unaligned addresses
1791  */
1792 
1793 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1794 				      unsigned char *args, struct trace *trace,
1795 				      struct thread *thread)
1796 {
1797 	size_t printed = 0;
1798 	unsigned char *p;
1799 	unsigned long val;
1800 
1801 	if (sc->args != NULL) {
1802 		struct format_field *field;
1803 		u8 bit = 1;
1804 		struct syscall_arg arg = {
1805 			.idx	= 0,
1806 			.mask	= 0,
1807 			.trace  = trace,
1808 			.thread = thread,
1809 		};
1810 
1811 		for (field = sc->args; field;
1812 		     field = field->next, ++arg.idx, bit <<= 1) {
1813 			if (arg.mask & bit)
1814 				continue;
1815 
1816 			/* special care for unaligned accesses */
1817 			p = args + sizeof(unsigned long) * arg.idx;
1818 			memcpy(&val, p, sizeof(val));
1819 
1820 			/*
1821  			 * Suppress this argument if its value is zero and
1822  			 * and we don't have a string associated in an
1823  			 * strarray for it.
1824  			 */
1825 			if (val == 0 &&
1826 			    !(sc->arg_scnprintf &&
1827 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1828 			      sc->arg_parm[arg.idx]))
1829 				continue;
1830 
1831 			printed += scnprintf(bf + printed, size - printed,
1832 					     "%s%s: ", printed ? ", " : "", field->name);
1833 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1834 				arg.val = val;
1835 				if (sc->arg_parm)
1836 					arg.parm = sc->arg_parm[arg.idx];
1837 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1838 								      size - printed, &arg);
1839 			} else {
1840 				printed += scnprintf(bf + printed, size - printed,
1841 						     "%ld", val);
1842 			}
1843 		}
1844 	} else {
1845 		int i = 0;
1846 
1847 		while (i < 6) {
1848 			/* special care for unaligned accesses */
1849 			p = args + sizeof(unsigned long) * i;
1850 			memcpy(&val, p, sizeof(val));
1851 			printed += scnprintf(bf + printed, size - printed,
1852 					     "%sarg%d: %ld",
1853 					     printed ? ", " : "", i, val);
1854 			++i;
1855 		}
1856 	}
1857 
1858 	return printed;
1859 }
1860 
/*
 * Function pointer type for per tracepoint sample handlers; it has the same
 * signature as trace__sys_enter().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1864 
1865 static struct syscall *trace__syscall_info(struct trace *trace,
1866 					   struct perf_evsel *evsel, int id)
1867 {
1868 
1869 	if (id < 0) {
1870 
1871 		/*
1872 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1873 		 * before that, leaving at a higher verbosity level till that is
1874 		 * explained. Reproduced with plain ftrace with:
1875 		 *
1876 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1877 		 * grep "NR -1 " /t/trace_pipe
1878 		 *
1879 		 * After generating some load on the machine.
1880  		 */
1881 		if (verbose > 1) {
1882 			static u64 n;
1883 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1884 				id, perf_evsel__name(evsel), ++n);
1885 		}
1886 		return NULL;
1887 	}
1888 
1889 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1890 	    trace__read_syscall_info(trace, id))
1891 		goto out_cant_read;
1892 
1893 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1894 		goto out_cant_read;
1895 
1896 	return &trace->syscalls.table[id];
1897 
1898 out_cant_read:
1899 	if (verbose) {
1900 		fprintf(trace->output, "Problems reading syscall %d", id);
1901 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1902 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1903 		fputs(" information\n", trace->output);
1904 	}
1905 	return NULL;
1906 }
1907 
1908 static void thread__update_stats(struct thread_trace *ttrace,
1909 				 int id, struct perf_sample *sample)
1910 {
1911 	struct int_node *inode;
1912 	struct stats *stats;
1913 	u64 duration = 0;
1914 
1915 	inode = intlist__findnew(ttrace->syscall_stats, id);
1916 	if (inode == NULL)
1917 		return;
1918 
1919 	stats = inode->priv;
1920 	if (stats == NULL) {
1921 		stats = malloc(sizeof(struct stats));
1922 		if (stats == NULL)
1923 			return;
1924 		init_stats(stats);
1925 		inode->priv = stats;
1926 	}
1927 
1928 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1929 		duration = sample->time - ttrace->entry_time;
1930 
1931 	update_stats(stats, duration);
1932 }
1933 
1934 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1935 {
1936 	struct thread_trace *ttrace;
1937 	u64 duration;
1938 	size_t printed;
1939 
1940 	if (trace->current == NULL)
1941 		return 0;
1942 
1943 	ttrace = thread__priv(trace->current);
1944 
1945 	if (!ttrace->entry_pending)
1946 		return 0;
1947 
1948 	duration = sample->time - ttrace->entry_time;
1949 
1950 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1951 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1952 	ttrace->entry_pending = false;
1953 
1954 	return printed;
1955 }
1956 
1957 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1958 			    union perf_event *event __maybe_unused,
1959 			    struct perf_sample *sample)
1960 {
1961 	char *msg;
1962 	void *args;
1963 	size_t printed = 0;
1964 	struct thread *thread;
1965 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1966 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1967 	struct thread_trace *ttrace;
1968 
1969 	if (sc == NULL)
1970 		return -1;
1971 
1972 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1973 	ttrace = thread__trace(thread, trace->output);
1974 	if (ttrace == NULL)
1975 		goto out_put;
1976 
1977 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1978 
1979 	if (ttrace->entry_str == NULL) {
1980 		ttrace->entry_str = malloc(trace__entry_str_size);
1981 		if (!ttrace->entry_str)
1982 			goto out_put;
1983 	}
1984 
1985 	if (!trace->summary_only)
1986 		trace__printf_interrupted_entry(trace, sample);
1987 
1988 	ttrace->entry_time = sample->time;
1989 	msg = ttrace->entry_str;
1990 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1991 
1992 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1993 					   args, trace, thread);
1994 
1995 	if (sc->is_exit) {
1996 		if (!trace->duration_filter && !trace->summary_only) {
1997 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1998 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1999 		}
2000 	} else {
2001 		ttrace->entry_pending = true;
2002 		/* See trace__vfs_getname & trace__sys_exit */
2003 		ttrace->filename.pending_open = false;
2004 	}
2005 
2006 	if (trace->current != thread) {
2007 		thread__put(trace->current);
2008 		trace->current = thread__get(thread);
2009 	}
2010 	err = 0;
2011 out_put:
2012 	thread__put(thread);
2013 	return err;
2014 }
2015 
2016 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
2017 			   union perf_event *event __maybe_unused,
2018 			   struct perf_sample *sample)
2019 {
2020 	long ret;
2021 	u64 duration = 0;
2022 	struct thread *thread;
2023 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
2024 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
2025 	struct thread_trace *ttrace;
2026 
2027 	if (sc == NULL)
2028 		return -1;
2029 
2030 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2031 	ttrace = thread__trace(thread, trace->output);
2032 	if (ttrace == NULL)
2033 		goto out_put;
2034 
2035 	if (trace->summary)
2036 		thread__update_stats(ttrace, id, sample);
2037 
2038 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
2039 
2040 	if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
2041 		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
2042 		ttrace->filename.pending_open = false;
2043 		++trace->stats.vfs_getname;
2044 	}
2045 
2046 	ttrace->exit_time = sample->time;
2047 
2048 	if (ttrace->entry_time) {
2049 		duration = sample->time - ttrace->entry_time;
2050 		if (trace__filter_duration(trace, duration))
2051 			goto out;
2052 	} else if (trace->duration_filter)
2053 		goto out;
2054 
2055 	if (trace->summary_only)
2056 		goto out;
2057 
2058 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
2059 
2060 	if (ttrace->entry_pending) {
2061 		fprintf(trace->output, "%-70s", ttrace->entry_str);
2062 	} else {
2063 		fprintf(trace->output, " ... [");
2064 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
2065 		fprintf(trace->output, "]: %s()", sc->name);
2066 	}
2067 
2068 	if (sc->fmt == NULL) {
2069 signed_print:
2070 		fprintf(trace->output, ") = %ld", ret);
2071 	} else if (ret < 0 && sc->fmt->errmsg) {
2072 		char bf[STRERR_BUFSIZE];
2073 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
2074 			   *e = audit_errno_to_name(-ret);
2075 
2076 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
2077 	} else if (ret == 0 && sc->fmt->timeout)
2078 		fprintf(trace->output, ") = 0 Timeout");
2079 	else if (sc->fmt->hexret)
2080 		fprintf(trace->output, ") = %#lx", ret);
2081 	else
2082 		goto signed_print;
2083 
2084 	fputc('\n', trace->output);
2085 out:
2086 	ttrace->entry_pending = false;
2087 	err = 0;
2088 out_put:
2089 	thread__put(thread);
2090 	return err;
2091 }
2092 
2093 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2094 			      union perf_event *event __maybe_unused,
2095 			      struct perf_sample *sample)
2096 {
2097 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2098 	struct thread_trace *ttrace;
2099 	size_t filename_len, entry_str_len, to_move;
2100 	ssize_t remaining_space;
2101 	char *pos;
2102 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2103 
2104 	if (!thread)
2105 		goto out;
2106 
2107 	ttrace = thread__priv(thread);
2108 	if (!ttrace)
2109 		goto out;
2110 
2111 	filename_len = strlen(filename);
2112 
2113 	if (ttrace->filename.namelen < filename_len) {
2114 		char *f = realloc(ttrace->filename.name, filename_len + 1);
2115 
2116 		if (f == NULL)
2117 				goto out;
2118 
2119 		ttrace->filename.namelen = filename_len;
2120 		ttrace->filename.name = f;
2121 	}
2122 
2123 	strcpy(ttrace->filename.name, filename);
2124 	ttrace->filename.pending_open = true;
2125 
2126 	if (!ttrace->filename.ptr)
2127 		goto out;
2128 
2129 	entry_str_len = strlen(ttrace->entry_str);
2130 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2131 	if (remaining_space <= 0)
2132 		goto out;
2133 
2134 	if (filename_len > (size_t)remaining_space) {
2135 		filename += filename_len - remaining_space;
2136 		filename_len = remaining_space;
2137 	}
2138 
2139 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2140 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2141 	memmove(pos + filename_len, pos, to_move);
2142 	memcpy(pos, filename, filename_len);
2143 
2144 	ttrace->filename.ptr = 0;
2145 	ttrace->filename.entry_str_pos = 0;
2146 out:
2147 	return 0;
2148 }
2149 
2150 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2151 				     union perf_event *event __maybe_unused,
2152 				     struct perf_sample *sample)
2153 {
2154         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2155 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2156 	struct thread *thread = machine__findnew_thread(trace->host,
2157 							sample->pid,
2158 							sample->tid);
2159 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2160 
2161 	if (ttrace == NULL)
2162 		goto out_dump;
2163 
2164 	ttrace->runtime_ms += runtime_ms;
2165 	trace->runtime_ms += runtime_ms;
2166 	thread__put(thread);
2167 	return 0;
2168 
2169 out_dump:
2170 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2171 	       evsel->name,
2172 	       perf_evsel__strval(evsel, sample, "comm"),
2173 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2174 	       runtime,
2175 	       perf_evsel__intval(evsel, sample, "vruntime"));
2176 	thread__put(thread);
2177 	return 0;
2178 }
2179 
2180 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2181 				union perf_event *event __maybe_unused,
2182 				struct perf_sample *sample)
2183 {
2184 	trace__printf_interrupted_entry(trace, sample);
2185 	trace__fprintf_tstamp(trace, sample->time, trace->output);
2186 
2187 	if (trace->trace_syscalls)
2188 		fprintf(trace->output, "(         ): ");
2189 
2190 	fprintf(trace->output, "%s:", evsel->name);
2191 
2192 	if (evsel->tp_format) {
2193 		event_format__fprintf(evsel->tp_format, sample->cpu,
2194 				      sample->raw_data, sample->raw_size,
2195 				      trace->output);
2196 	}
2197 
2198 	fprintf(trace->output, ")\n");
2199 	return 0;
2200 }
2201 
2202 static void print_location(FILE *f, struct perf_sample *sample,
2203 			   struct addr_location *al,
2204 			   bool print_dso, bool print_sym)
2205 {
2206 
2207 	if ((verbose || print_dso) && al->map)
2208 		fprintf(f, "%s@", al->map->dso->long_name);
2209 
2210 	if ((verbose || print_sym) && al->sym)
2211 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2212 			al->addr - al->sym->start);
2213 	else if (al->map)
2214 		fprintf(f, "0x%" PRIx64, al->addr);
2215 	else
2216 		fprintf(f, "0x%" PRIx64, sample->addr);
2217 }
2218 
2219 static int trace__pgfault(struct trace *trace,
2220 			  struct perf_evsel *evsel,
2221 			  union perf_event *event,
2222 			  struct perf_sample *sample)
2223 {
2224 	struct thread *thread;
2225 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2226 	struct addr_location al;
2227 	char map_type = 'd';
2228 	struct thread_trace *ttrace;
2229 	int err = -1;
2230 
2231 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2232 	ttrace = thread__trace(thread, trace->output);
2233 	if (ttrace == NULL)
2234 		goto out_put;
2235 
2236 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2237 		ttrace->pfmaj++;
2238 	else
2239 		ttrace->pfmin++;
2240 
2241 	if (trace->summary_only)
2242 		goto out;
2243 
2244 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2245 			      sample->ip, &al);
2246 
2247 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2248 
2249 	fprintf(trace->output, "%sfault [",
2250 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2251 		"maj" : "min");
2252 
2253 	print_location(trace->output, sample, &al, false, true);
2254 
2255 	fprintf(trace->output, "] => ");
2256 
2257 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2258 				   sample->addr, &al);
2259 
2260 	if (!al.map) {
2261 		thread__find_addr_location(thread, cpumode,
2262 					   MAP__FUNCTION, sample->addr, &al);
2263 
2264 		if (al.map)
2265 			map_type = 'x';
2266 		else
2267 			map_type = '?';
2268 	}
2269 
2270 	print_location(trace->output, sample, &al, true, false);
2271 
2272 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2273 out:
2274 	err = 0;
2275 out_put:
2276 	thread__put(thread);
2277 	return err;
2278 }
2279 
2280 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2281 {
2282 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2283 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2284 		return false;
2285 
2286 	if (trace->pid_list || trace->tid_list)
2287 		return true;
2288 
2289 	return false;
2290 }
2291 
2292 static int trace__process_sample(struct perf_tool *tool,
2293 				 union perf_event *event,
2294 				 struct perf_sample *sample,
2295 				 struct perf_evsel *evsel,
2296 				 struct machine *machine __maybe_unused)
2297 {
2298 	struct trace *trace = container_of(tool, struct trace, tool);
2299 	int err = 0;
2300 
2301 	tracepoint_handler handler = evsel->handler;
2302 
2303 	if (skip_sample(trace, sample))
2304 		return 0;
2305 
2306 	if (!trace->full_time && trace->base_time == 0)
2307 		trace->base_time = sample->time;
2308 
2309 	if (handler) {
2310 		++trace->nr_events;
2311 		handler(trace, evsel, event, sample);
2312 	}
2313 
2314 	return err;
2315 }
2316 
2317 static int parse_target_str(struct trace *trace)
2318 {
2319 	if (trace->opts.target.pid) {
2320 		trace->pid_list = intlist__new(trace->opts.target.pid);
2321 		if (trace->pid_list == NULL) {
2322 			pr_err("Error parsing process id string\n");
2323 			return -EINVAL;
2324 		}
2325 	}
2326 
2327 	if (trace->opts.target.tid) {
2328 		trace->tid_list = intlist__new(trace->opts.target.tid);
2329 		if (trace->tid_list == NULL) {
2330 			pr_err("Error parsing thread id string\n");
2331 			return -EINVAL;
2332 		}
2333 	}
2334 
2335 	return 0;
2336 }
2337 
2338 static int trace__record(struct trace *trace, int argc, const char **argv)
2339 {
2340 	unsigned int rec_argc, i, j;
2341 	const char **rec_argv;
2342 	const char * const record_args[] = {
2343 		"record",
2344 		"-R",
2345 		"-m", "1024",
2346 		"-c", "1",
2347 	};
2348 
2349 	const char * const sc_args[] = { "-e", };
2350 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2351 	const char * const majpf_args[] = { "-e", "major-faults" };
2352 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2353 	const char * const minpf_args[] = { "-e", "minor-faults" };
2354 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2355 
2356 	/* +1 is for the event string below */
2357 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2358 		majpf_args_nr + minpf_args_nr + argc;
2359 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2360 
2361 	if (rec_argv == NULL)
2362 		return -ENOMEM;
2363 
2364 	j = 0;
2365 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2366 		rec_argv[j++] = record_args[i];
2367 
2368 	if (trace->trace_syscalls) {
2369 		for (i = 0; i < sc_args_nr; i++)
2370 			rec_argv[j++] = sc_args[i];
2371 
2372 		/* event string may be different for older kernels - e.g., RHEL6 */
2373 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2374 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2375 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2376 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2377 		else {
2378 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2379 			return -1;
2380 		}
2381 	}
2382 
2383 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2384 		for (i = 0; i < majpf_args_nr; i++)
2385 			rec_argv[j++] = majpf_args[i];
2386 
2387 	if (trace->trace_pgfaults & TRACE_PFMIN)
2388 		for (i = 0; i < minpf_args_nr; i++)
2389 			rec_argv[j++] = minpf_args[i];
2390 
2391 	for (i = 0; i < (unsigned int)argc; i++)
2392 		rec_argv[j++] = argv[i];
2393 
2394 	return cmd_record(j, rec_argv, NULL);
2395 }
2396 
2397 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2398 
2399 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2400 {
2401 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2402 
2403 	if (IS_ERR(evsel))
2404 		return false;
2405 
2406 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2407 		perf_evsel__delete(evsel);
2408 		return false;
2409 	}
2410 
2411 	evsel->handler = trace__vfs_getname;
2412 	perf_evlist__add(evlist, evsel);
2413 	return true;
2414 }
2415 
2416 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2417 				    u64 config)
2418 {
2419 	struct perf_evsel *evsel;
2420 	struct perf_event_attr attr = {
2421 		.type = PERF_TYPE_SOFTWARE,
2422 		.mmap_data = 1,
2423 	};
2424 
2425 	attr.config = config;
2426 	attr.sample_period = 1;
2427 
2428 	event_attr_init(&attr);
2429 
2430 	evsel = perf_evsel__new(&attr);
2431 	if (!evsel)
2432 		return -ENOMEM;
2433 
2434 	evsel->handler = trace__pgfault;
2435 	perf_evlist__add(evlist, evsel);
2436 
2437 	return 0;
2438 }
2439 
2440 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2441 {
2442 	const u32 type = event->header.type;
2443 	struct perf_evsel *evsel;
2444 
2445 	if (!trace->full_time && trace->base_time == 0)
2446 		trace->base_time = sample->time;
2447 
2448 	if (type != PERF_RECORD_SAMPLE) {
2449 		trace__process_event(trace, trace->host, event, sample);
2450 		return;
2451 	}
2452 
2453 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2454 	if (evsel == NULL) {
2455 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2456 		return;
2457 	}
2458 
2459 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2460 	    sample->raw_data == NULL) {
2461 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2462 		       perf_evsel__name(evsel), sample->tid,
2463 		       sample->cpu, sample->raw_size);
2464 	} else {
2465 		tracepoint_handler handler = evsel->handler;
2466 		handler(trace, evsel, event, sample);
2467 	}
2468 }
2469 
2470 static int trace__add_syscall_newtp(struct trace *trace)
2471 {
2472 	int ret = -1;
2473 	struct perf_evlist *evlist = trace->evlist;
2474 	struct perf_evsel *sys_enter, *sys_exit;
2475 
2476 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2477 	if (sys_enter == NULL)
2478 		goto out;
2479 
2480 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2481 		goto out_delete_sys_enter;
2482 
2483 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2484 	if (sys_exit == NULL)
2485 		goto out_delete_sys_enter;
2486 
2487 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2488 		goto out_delete_sys_exit;
2489 
2490 	perf_evlist__add(evlist, sys_enter);
2491 	perf_evlist__add(evlist, sys_exit);
2492 
2493 	trace->syscalls.events.sys_enter = sys_enter;
2494 	trace->syscalls.events.sys_exit  = sys_exit;
2495 
2496 	ret = 0;
2497 out:
2498 	return ret;
2499 
2500 out_delete_sys_exit:
2501 	perf_evsel__delete_priv(sys_exit);
2502 out_delete_sys_enter:
2503 	perf_evsel__delete_priv(sys_enter);
2504 	goto out;
2505 }
2506 
2507 static int trace__set_ev_qualifier_filter(struct trace *trace)
2508 {
2509 	int err = -1;
2510 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2511 						trace->ev_qualifier_ids.nr,
2512 						trace->ev_qualifier_ids.entries);
2513 
2514 	if (filter == NULL)
2515 		goto out_enomem;
2516 
2517 	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2518 		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2519 
2520 	free(filter);
2521 out:
2522 	return err;
2523 out_enomem:
2524 	errno = ENOMEM;
2525 	goto out;
2526 }
2527 
2528 static int trace__run(struct trace *trace, int argc, const char **argv)
2529 {
2530 	struct perf_evlist *evlist = trace->evlist;
2531 	struct perf_evsel *evsel;
2532 	int err = -1, i;
2533 	unsigned long before;
2534 	const bool forks = argc > 0;
2535 	bool draining = false;
2536 
2537 	trace->live = true;
2538 
2539 	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
2540 		goto out_error_raw_syscalls;
2541 
2542 	if (trace->trace_syscalls)
2543 		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);
2544 
2545 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2546 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
2547 		goto out_error_mem;
2548 	}
2549 
2550 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2551 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2552 		goto out_error_mem;
2553 
2554 	if (trace->sched &&
2555 	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2556 				   trace__sched_stat_runtime))
2557 		goto out_error_sched_stat_runtime;
2558 
2559 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2560 	if (err < 0) {
2561 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2562 		goto out_delete_evlist;
2563 	}
2564 
2565 	err = trace__symbols_init(trace, evlist);
2566 	if (err < 0) {
2567 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2568 		goto out_delete_evlist;
2569 	}
2570 
2571 	perf_evlist__config(evlist, &trace->opts);
2572 
2573 	signal(SIGCHLD, sig_handler);
2574 	signal(SIGINT, sig_handler);
2575 
2576 	if (forks) {
2577 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2578 						    argv, false, NULL);
2579 		if (err < 0) {
2580 			fprintf(trace->output, "Couldn't run the workload!\n");
2581 			goto out_delete_evlist;
2582 		}
2583 	}
2584 
2585 	err = perf_evlist__open(evlist);
2586 	if (err < 0)
2587 		goto out_error_open;
2588 
2589 	/*
2590 	 * Better not use !target__has_task() here because we need to cover the
2591 	 * case where no threads were specified in the command line, but a
2592 	 * workload was, and in that case we will fill in the thread_map when
2593 	 * we fork the workload in perf_evlist__prepare_workload.
2594 	 */
2595 	if (trace->filter_pids.nr > 0)
2596 		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
2597 	else if (thread_map__pid(evlist->threads, 0) == -1)
2598 		err = perf_evlist__set_filter_pid(evlist, getpid());
2599 
2600 	if (err < 0)
2601 		goto out_error_mem;
2602 
2603 	if (trace->ev_qualifier_ids.nr > 0) {
2604 		err = trace__set_ev_qualifier_filter(trace);
2605 		if (err < 0)
2606 			goto out_errno;
2607 
2608 		pr_debug("event qualifier tracepoint filter: %s\n",
2609 			 trace->syscalls.events.sys_exit->filter);
2610 	}
2611 
2612 	err = perf_evlist__apply_filters(evlist, &evsel);
2613 	if (err < 0)
2614 		goto out_error_apply_filters;
2615 
2616 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2617 	if (err < 0)
2618 		goto out_error_mmap;
2619 
2620 	if (!target__none(&trace->opts.target))
2621 		perf_evlist__enable(evlist);
2622 
2623 	if (forks)
2624 		perf_evlist__start_workload(evlist);
2625 
2626 	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
2627 				  evlist->threads->nr > 1 ||
2628 				  perf_evlist__first(evlist)->attr.inherit;
2629 again:
2630 	before = trace->nr_events;
2631 
2632 	for (i = 0; i < evlist->nr_mmaps; i++) {
2633 		union perf_event *event;
2634 
2635 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2636 			struct perf_sample sample;
2637 
2638 			++trace->nr_events;
2639 
2640 			err = perf_evlist__parse_sample(evlist, event, &sample);
2641 			if (err) {
2642 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2643 				goto next_event;
2644 			}
2645 
2646 			trace__handle_event(trace, event, &sample);
2647 next_event:
2648 			perf_evlist__mmap_consume(evlist, i);
2649 
2650 			if (interrupted)
2651 				goto out_disable;
2652 
2653 			if (done && !draining) {
2654 				perf_evlist__disable(evlist);
2655 				draining = true;
2656 			}
2657 		}
2658 	}
2659 
2660 	if (trace->nr_events == before) {
2661 		int timeout = done ? 100 : -1;
2662 
2663 		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
2664 			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
2665 				draining = true;
2666 
2667 			goto again;
2668 		}
2669 	} else {
2670 		goto again;
2671 	}
2672 
2673 out_disable:
2674 	thread__zput(trace->current);
2675 
2676 	perf_evlist__disable(evlist);
2677 
2678 	if (!err) {
2679 		if (trace->summary)
2680 			trace__fprintf_thread_summary(trace, trace->output);
2681 
2682 		if (trace->show_tool_stats) {
2683 			fprintf(trace->output, "Stats:\n "
2684 					       " vfs_getname : %" PRIu64 "\n"
2685 					       " proc_getname: %" PRIu64 "\n",
2686 				trace->stats.vfs_getname,
2687 				trace->stats.proc_getname);
2688 		}
2689 	}
2690 
2691 out_delete_evlist:
2692 	perf_evlist__delete(evlist);
2693 	trace->evlist = NULL;
2694 	trace->live = false;
2695 	return err;
2696 {
2697 	char errbuf[BUFSIZ];
2698 
2699 out_error_sched_stat_runtime:
2700 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
2701 	goto out_error;
2702 
2703 out_error_raw_syscalls:
2704 	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
2705 	goto out_error;
2706 
2707 out_error_mmap:
2708 	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
2709 	goto out_error;
2710 
2711 out_error_open:
2712 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2713 
2714 out_error:
2715 	fprintf(trace->output, "%s\n", errbuf);
2716 	goto out_delete_evlist;
2717 
2718 out_error_apply_filters:
2719 	fprintf(trace->output,
2720 		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
2721 		evsel->filter, perf_evsel__name(evsel), errno,
2722 		strerror_r(errno, errbuf, sizeof(errbuf)));
2723 	goto out_delete_evlist;
2724 }
2725 out_error_mem:
2726 	fprintf(trace->output, "Not enough memory to run!\n");
2727 	goto out_delete_evlist;
2728 
2729 out_errno:
2730 	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
2731 	goto out_delete_evlist;
2732 }
2733 
2734 static int trace__replay(struct trace *trace)
2735 {
2736 	const struct perf_evsel_str_handler handlers[] = {
2737 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2738 	};
2739 	struct perf_data_file file = {
2740 		.path  = input_name,
2741 		.mode  = PERF_DATA_MODE_READ,
2742 		.force = trace->force,
2743 	};
2744 	struct perf_session *session;
2745 	struct perf_evsel *evsel;
2746 	int err = -1;
2747 
2748 	trace->tool.sample	  = trace__process_sample;
2749 	trace->tool.mmap	  = perf_event__process_mmap;
2750 	trace->tool.mmap2	  = perf_event__process_mmap2;
2751 	trace->tool.comm	  = perf_event__process_comm;
2752 	trace->tool.exit	  = perf_event__process_exit;
2753 	trace->tool.fork	  = perf_event__process_fork;
2754 	trace->tool.attr	  = perf_event__process_attr;
2755 	trace->tool.tracing_data = perf_event__process_tracing_data;
2756 	trace->tool.build_id	  = perf_event__process_build_id;
2757 
2758 	trace->tool.ordered_events = true;
2759 	trace->tool.ordering_requires_timestamps = true;
2760 
2761 	/* add tid to output */
2762 	trace->multiple_threads = true;
2763 
2764 	session = perf_session__new(&file, false, &trace->tool);
2765 	if (session == NULL)
2766 		return -1;
2767 
2768 	if (symbol__init(&session->header.env) < 0)
2769 		goto out;
2770 
2771 	trace->host = &session->machines.host;
2772 
2773 	err = perf_session__set_tracepoints_handlers(session, handlers);
2774 	if (err)
2775 		goto out;
2776 
2777 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2778 						     "raw_syscalls:sys_enter");
2779 	/* older kernels have syscalls tp versus raw_syscalls */
2780 	if (evsel == NULL)
2781 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2782 							     "syscalls:sys_enter");
2783 
2784 	if (evsel &&
2785 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2786 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2787 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2788 		goto out;
2789 	}
2790 
2791 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2792 						     "raw_syscalls:sys_exit");
2793 	if (evsel == NULL)
2794 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2795 							     "syscalls:sys_exit");
2796 	if (evsel &&
2797 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2798 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2799 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2800 		goto out;
2801 	}
2802 
2803 	evlist__for_each(session->evlist, evsel) {
2804 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2805 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2806 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2807 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2808 			evsel->handler = trace__pgfault;
2809 	}
2810 
2811 	err = parse_target_str(trace);
2812 	if (err != 0)
2813 		goto out;
2814 
2815 	setup_pager();
2816 
2817 	err = perf_session__process_events(session);
2818 	if (err)
2819 		pr_err("Failed to process events, error %d", err);
2820 
2821 	else if (trace->summary)
2822 		trace__fprintf_thread_summary(trace, trace->output);
2823 
2824 out:
2825 	perf_session__delete(session);
2826 
2827 	return err;
2828 }
2829 
/*
 * Print the heading that precedes the per-thread summary.
 * Returns what fprintf() returned, converted to size_t.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2838 
2839 static size_t thread__dump_stats(struct thread_trace *ttrace,
2840 				 struct trace *trace, FILE *fp)
2841 {
2842 	struct stats *stats;
2843 	size_t printed = 0;
2844 	struct syscall *sc;
2845 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2846 
2847 	if (inode == NULL)
2848 		return 0;
2849 
2850 	printed += fprintf(fp, "\n");
2851 
2852 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2853 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2854 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2855 
2856 	/* each int_node is a syscall */
2857 	while (inode) {
2858 		stats = inode->priv;
2859 		if (stats) {
2860 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2861 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2862 			double avg = avg_stats(stats);
2863 			double pct;
2864 			u64 n = (u64) stats->n;
2865 
2866 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2867 			avg /= NSEC_PER_MSEC;
2868 
2869 			sc = &trace->syscalls.table[inode->i];
2870 			printed += fprintf(fp, "   %-15s", sc->name);
2871 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2872 					   n, avg * n, min, avg);
2873 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2874 		}
2875 
2876 		inode = intlist__next(inode);
2877 	}
2878 
2879 	printed += fprintf(fp, "\n\n");
2880 
2881 	return printed;
2882 }
2883 
/*
 * Context handed, through the opaque priv pointer, to the per-thread
 * callback that prints the summary.
 */
struct summary_data {
	FILE		*fp;		/* where the summary is written */
	struct trace	*trace;		/* the trace session being summarized */
	size_t		printed;	/* characters printed so far */
};
2890 
2891 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2892 {
2893 	struct summary_data *data = priv;
2894 	FILE *fp = data->fp;
2895 	size_t printed = data->printed;
2896 	struct trace *trace = data->trace;
2897 	struct thread_trace *ttrace = thread__priv(thread);
2898 	double ratio;
2899 
2900 	if (ttrace == NULL)
2901 		return 0;
2902 
2903 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2904 
2905 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2906 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2907 	printed += fprintf(fp, "%.1f%%", ratio);
2908 	if (ttrace->pfmaj)
2909 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2910 	if (ttrace->pfmin)
2911 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2912 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2913 	printed += thread__dump_stats(ttrace, trace, fp);
2914 
2915 	data->printed += printed;
2916 
2917 	return 0;
2918 }
2919 
2920 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2921 {
2922 	struct summary_data data = {
2923 		.fp = fp,
2924 		.trace = trace
2925 	};
2926 	data.printed = trace__fprintf_threads_header(fp);
2927 
2928 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2929 
2930 	return data.printed;
2931 }
2932 
2933 static int trace__set_duration(const struct option *opt, const char *str,
2934 			       int unset __maybe_unused)
2935 {
2936 	struct trace *trace = opt->value;
2937 
2938 	trace->duration_filter = atof(str);
2939 	return 0;
2940 }
2941 
2942 static int trace__set_filter_pids(const struct option *opt, const char *str,
2943 				  int unset __maybe_unused)
2944 {
2945 	int ret = -1;
2946 	size_t i;
2947 	struct trace *trace = opt->value;
2948 	/*
2949 	 * FIXME: introduce a intarray class, plain parse csv and create a
2950 	 * { int nr, int entries[] } struct...
2951 	 */
2952 	struct intlist *list = intlist__new(str);
2953 
2954 	if (list == NULL)
2955 		return -1;
2956 
2957 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2958 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2959 
2960 	if (trace->filter_pids.entries == NULL)
2961 		goto out;
2962 
2963 	trace->filter_pids.entries[0] = getpid();
2964 
2965 	for (i = 1; i < trace->filter_pids.nr; ++i)
2966 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2967 
2968 	intlist__delete(list);
2969 	ret = 0;
2970 out:
2971 	return ret;
2972 }
2973 
2974 static int trace__open_output(struct trace *trace, const char *filename)
2975 {
2976 	struct stat st;
2977 
2978 	if (!stat(filename, &st) && st.st_size) {
2979 		char oldname[PATH_MAX];
2980 
2981 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2982 		unlink(oldname);
2983 		rename(filename, oldname);
2984 	}
2985 
2986 	trace->output = fopen(filename, "w");
2987 
2988 	return trace->output == NULL ? -errno : 0;
2989 }
2990 
2991 static int parse_pagefaults(const struct option *opt, const char *str,
2992 			    int unset __maybe_unused)
2993 {
2994 	int *trace_pgfaults = opt->value;
2995 
2996 	if (strcmp(str, "all") == 0)
2997 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2998 	else if (strcmp(str, "maj") == 0)
2999 		*trace_pgfaults |= TRACE_PFMAJ;
3000 	else if (strcmp(str, "min") == 0)
3001 		*trace_pgfaults |= TRACE_PFMIN;
3002 	else
3003 		return -1;
3004 
3005 	return 0;
3006 }
3007 
3008 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3009 {
3010 	struct perf_evsel *evsel;
3011 
3012 	evlist__for_each(evlist, evsel)
3013 		evsel->handler = handler;
3014 }
3015 
3016 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3017 {
3018 	const char *trace_usage[] = {
3019 		"perf trace [<options>] [<command>]",
3020 		"perf trace [<options>] -- <command> [<options>]",
3021 		"perf trace record [<options>] [<command>]",
3022 		"perf trace record [<options>] -- <command> [<options>]",
3023 		NULL
3024 	};
3025 	struct trace trace = {
3026 		.audit = {
3027 			.machine = audit_detect_machine(),
3028 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
3029 		},
3030 		.syscalls = {
3031 			. max = -1,
3032 		},
3033 		.opts = {
3034 			.target = {
3035 				.uid	   = UINT_MAX,
3036 				.uses_mmap = true,
3037 			},
3038 			.user_freq     = UINT_MAX,
3039 			.user_interval = ULLONG_MAX,
3040 			.no_buffering  = true,
3041 			.mmap_pages    = UINT_MAX,
3042 			.proc_map_timeout  = 500,
3043 		},
3044 		.output = stderr,
3045 		.show_comm = true,
3046 		.trace_syscalls = true,
3047 	};
3048 	const char *output_name = NULL;
3049 	const char *ev_qualifier_str = NULL;
3050 	const struct option trace_options[] = {
3051 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
3052 		     "event selector. use 'perf list' to list available events",
3053 		     parse_events_option),
3054 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3055 		    "show the thread COMM next to its id"),
3056 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3057 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3058 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3059 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3060 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3061 		    "trace events on existing process id"),
3062 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3063 		    "trace events on existing thread id"),
3064 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3065 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3066 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3067 		    "system-wide collection from all CPUs"),
3068 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3069 		    "list of cpus to monitor"),
3070 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3071 		    "child tasks do not inherit counters"),
3072 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3073 		     "number of mmap data pages",
3074 		     perf_evlist__parse_mmap_pages),
3075 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3076 		   "user to profile"),
3077 	OPT_CALLBACK(0, "duration", &trace, "float",
3078 		     "show only events with duration > N.M ms",
3079 		     trace__set_duration),
3080 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3081 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3082 	OPT_BOOLEAN('T', "time", &trace.full_time,
3083 		    "Show full timestamp, not time relative to first start"),
3084 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3085 		    "Show only syscall summary with statistics"),
3086 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3087 		    "Show all syscalls and summary with statistics"),
3088 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3089 		     "Trace pagefaults", parse_pagefaults, "maj"),
3090 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3091 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3092 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3093 			"per thread proc mmap processing timeout in ms"),
3094 	OPT_END()
3095 	};
3096 	const char * const trace_subcommands[] = { "record", NULL };
3097 	int err;
3098 	char bf[BUFSIZ];
3099 
3100 	signal(SIGSEGV, sighandler_dump_stack);
3101 	signal(SIGFPE, sighandler_dump_stack);
3102 
3103 	trace.evlist = perf_evlist__new();
3104 
3105 	if (trace.evlist == NULL) {
3106 		pr_err("Not enough memory to run!\n");
3107 		err = -ENOMEM;
3108 		goto out;
3109 	}
3110 
3111 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3112 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3113 
3114 	if (trace.trace_pgfaults) {
3115 		trace.opts.sample_address = true;
3116 		trace.opts.sample_time = true;
3117 	}
3118 
3119 	if (trace.evlist->nr_entries > 0)
3120 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3121 
3122 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3123 		return trace__record(&trace, argc-1, &argv[1]);
3124 
3125 	/* summary_only implies summary option, but don't overwrite summary if set */
3126 	if (trace.summary_only)
3127 		trace.summary = trace.summary_only;
3128 
3129 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3130 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3131 		pr_err("Please specify something to trace.\n");
3132 		return -1;
3133 	}
3134 
3135 	if (output_name != NULL) {
3136 		err = trace__open_output(&trace, output_name);
3137 		if (err < 0) {
3138 			perror("failed to create output file");
3139 			goto out;
3140 		}
3141 	}
3142 
3143 	if (ev_qualifier_str != NULL) {
3144 		const char *s = ev_qualifier_str;
3145 		struct strlist_config slist_config = {
3146 			.dirname = system_path(STRACE_GROUPS_DIR),
3147 		};
3148 
3149 		trace.not_ev_qualifier = *s == '!';
3150 		if (trace.not_ev_qualifier)
3151 			++s;
3152 		trace.ev_qualifier = strlist__new(s, &slist_config);
3153 		if (trace.ev_qualifier == NULL) {
3154 			fputs("Not enough memory to parse event qualifier",
3155 			      trace.output);
3156 			err = -ENOMEM;
3157 			goto out_close;
3158 		}
3159 
3160 		err = trace__validate_ev_qualifier(&trace);
3161 		if (err)
3162 			goto out_close;
3163 	}
3164 
3165 	err = target__validate(&trace.opts.target);
3166 	if (err) {
3167 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3168 		fprintf(trace.output, "%s", bf);
3169 		goto out_close;
3170 	}
3171 
3172 	err = target__parse_uid(&trace.opts.target);
3173 	if (err) {
3174 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3175 		fprintf(trace.output, "%s", bf);
3176 		goto out_close;
3177 	}
3178 
3179 	if (!argc && target__none(&trace.opts.target))
3180 		trace.opts.target.system_wide = true;
3181 
3182 	if (input_name)
3183 		err = trace__replay(&trace);
3184 	else
3185 		err = trace__run(&trace, argc, argv);
3186 
3187 out_close:
3188 	if (output_name != NULL)
3189 		fclose(trace.output);
3190 out:
3191 	return err;
3192 }
3193