xref: /linux/tools/perf/builtin-trace.c (revision 5e4e38446a62a4f50d77b0dd11d4b379dee08988)
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 
38 #include <libaudit.h>
39 #include <stdlib.h>
40 #include <sys/mman.h>
41 #include <linux/futex.h>
42 #include <linux/err.h>
43 
/*
 * For older distros: fallback values for constants that the system headers
 * included above may not provide. Each one is defined only if it is not
 * already defined.
 */
#ifndef MAP_STACK
# define MAP_STACK		0x20000
#endif

#ifndef MADV_HWPOISON
# define MADV_HWPOISON		100

#endif

#ifndef MADV_MERGEABLE
# define MADV_MERGEABLE		12
#endif

#ifndef MADV_UNMERGEABLE
# define MADV_UNMERGEABLE	13
#endif

/* eventfd flags */
#ifndef EFD_SEMAPHORE
# define EFD_SEMAPHORE		1
#endif

#ifndef EFD_NONBLOCK
# define EFD_NONBLOCK		00004000
#endif

#ifndef EFD_CLOEXEC
# define EFD_CLOEXEC		02000000
#endif

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

/* socket type and socket type flags */
#ifndef SOCK_DCCP
# define SOCK_DCCP		6
#endif

#ifndef SOCK_CLOEXEC
# define SOCK_CLOEXEC		02000000
#endif

#ifndef SOCK_NONBLOCK
# define SOCK_NONBLOCK		00004000
#endif

#ifndef MSG_CMSG_CLOEXEC
# define MSG_CMSG_CLOEXEC	0x40000000
#endif

/* perf_event_open flags */
#ifndef PERF_FLAG_FD_NO_GROUP
# define PERF_FLAG_FD_NO_GROUP		(1UL << 0)
#endif

#ifndef PERF_FLAG_FD_OUTPUT
# define PERF_FLAG_FD_OUTPUT		(1UL << 1)
#endif

#ifndef PERF_FLAG_PID_CGROUP
# define PERF_FLAG_PID_CGROUP		(1UL << 2) /* pid=cgroup id, per-cpu mode only */
#endif

#ifndef PERF_FLAG_FD_CLOEXEC
# define PERF_FLAG_FD_CLOEXEC		(1UL << 3) /* O_CLOEXEC */
#endif
109 
110 
/*
 * Describes how to fetch one tracepoint field from a sample's raw data:
 * the field's byte offset plus the accessor used to read it.
 */
struct tp_field {
	int offset;	/* byte offset of the field inside sample->raw_data */
	union {
		/* reads the field and returns it as a u64 */
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		/* returns the address of the field inside the raw data */
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
118 
119 #define TP_UINT_FIELD(bits) \
120 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
121 { \
122 	u##bits value; \
123 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
124 	return value;  \
125 }
126 
127 TP_UINT_FIELD(8);
128 TP_UINT_FIELD(16);
129 TP_UINT_FIELD(32);
130 TP_UINT_FIELD(64);
131 
132 #define TP_UINT_FIELD__SWAPPED(bits) \
133 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
134 { \
135 	u##bits value; \
136 	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
137 	return bswap_##bits(value);\
138 }
139 
140 TP_UINT_FIELD__SWAPPED(16);
141 TP_UINT_FIELD__SWAPPED(32);
142 TP_UINT_FIELD__SWAPPED(64);
143 
144 static int tp_field__init_uint(struct tp_field *field,
145 			       struct format_field *format_field,
146 			       bool needs_swap)
147 {
148 	field->offset = format_field->offset;
149 
150 	switch (format_field->size) {
151 	case 1:
152 		field->integer = tp_field__u8;
153 		break;
154 	case 2:
155 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
156 		break;
157 	case 4:
158 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
159 		break;
160 	case 8:
161 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
162 		break;
163 	default:
164 		return -1;
165 	}
166 
167 	return 0;
168 }
169 
170 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
171 {
172 	return sample->raw_data + field->offset;
173 }
174 
175 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
176 {
177 	field->offset = format_field->offset;
178 	field->pointer = tp_field__ptr;
179 	return 0;
180 }
181 
/*
 * Field accessors kept in evsel->priv for the syscall tracepoints:
 * 'id' plus either 'args' or 'ret', which share storage through the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
188 
189 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
190 					  struct tp_field *field,
191 					  const char *name)
192 {
193 	struct format_field *format_field = perf_evsel__field(evsel, name);
194 
195 	if (format_field == NULL)
196 		return -1;
197 
198 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
199 }
200 
/*
 * Initialize the integer accessor of the struct syscall_tp member 'name'
 * (kept in evsel->priv) from the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_uint_field().
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
204 
/*
 * Look up the tracepoint field @name in @evsel and initialize @field as a
 * pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise the result of
 * tp_field__init_ptr().
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *ff = perf_evsel__field(evsel, name);

	return ff != NULL ? tp_field__init_ptr(field, ff) : -1;
}
216 
/*
 * Initialize the pointer accessor of the struct syscall_tp member 'name'
 * (kept in evsel->priv) from the tracepoint field of the same name.
 * Evaluates to the return value of perf_evsel__init_tp_ptr_field().
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
220 
/* Release the private data attached to @evsel, then delete the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	/* priv must be released first, while evsel is still valid */
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
226 
227 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
228 {
229 	evsel->priv = malloc(sizeof(struct syscall_tp));
230 	if (evsel->priv != NULL) {
231 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
232 			goto out_delete;
233 
234 		evsel->handler = handler;
235 		return 0;
236 	}
237 
238 	return -ENOMEM;
239 
240 out_delete:
241 	zfree(&evsel->priv);
242 	return -ENOENT;
243 }
244 
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare
 * it with perf_evsel__init_syscall_tp(), using @handler as its handler.
 *
 * Returns the new evsel, or NULL if the tracepoint could not be created or
 * initialized.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (IS_ERR(evsel)) {
		/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
		evsel = perf_evsel__newtp("syscalls", direction);
		if (IS_ERR(evsel))
			return NULL;
	}

	if (perf_evsel__init_syscall_tp(evsel, handler) != 0) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
265 
/*
 * Read the struct syscall_tp member 'name' of evsel->priv from 'sample'
 * through its integer accessor; evaluates to the u64 it returns.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/*
 * Same as above but through the pointer accessor; evaluates to the pointer
 * it returns.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
273 
/* Context handed to the syscall_arg__scnprintf_*() argument formatters. */
struct syscall_arg {
	unsigned long val;	/* raw value of the argument being formatted */
	struct thread *thread;	/* NOTE(review): presumably the thread issuing the syscall - confirm */
	struct trace  *trace;	/* NOTE(review): presumably the owning trace session - confirm */
	void	      *parm;	/* formatter specific data, e.g. a struct strarray */
	u8	      idx;	/* index of this argument in the syscall */
	u8	      mask;	/* bit N set by a formatter marks argument N as masked */
};
282 
/* A table of names for a range of integer values. */
struct strarray {
	int	    offset;	/* subtracted from the value to get the index into entries */
	int	    nr_entries;	/* number of slots in entries */
	const char **entries;	/* the names, indexed by (value - offset) */
};
288 
/*
 * Define 'strarray__<array>' wrapping the string table 'array'; 'array'
 * must be a real array since its length is taken with ARRAY_SIZE().
 * The offset is left at zero.
 */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same as DEFINE_STRARRAY() but values start at 'off' instead of zero. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
299 
300 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
301 						const char *intfmt,
302 					        struct syscall_arg *arg)
303 {
304 	struct strarray *sa = arg->parm;
305 	int idx = arg->val - sa->offset;
306 
307 	if (idx < 0 || idx >= sa->nr_entries)
308 		return scnprintf(bf, size, intfmt, arg->val);
309 
310 	return scnprintf(bf, size, "%s", sa->entries[idx]);
311 }
312 
/*
 * Format an argument through the string table in arg->parm; values without
 * a name are printed with "%d".
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
320 
321 #if defined(__i386__) || defined(__x86_64__)
322 /*
323  * FIXME: Make this available to all arches as soon as the ioctl beautifier
324  * 	  gets rewritten to support all arches.
325  */
/*
 * Format an argument through the string table in arg->parm; values without
 * a name are printed with "%#x".
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
333 #endif /* defined(__i386__) || defined(__x86_64__) */
334 
335 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
336 					struct syscall_arg *arg);
337 
338 #define SCA_FD syscall_arg__scnprintf_fd
339 
340 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
341 					   struct syscall_arg *arg)
342 {
343 	int fd = arg->val;
344 
345 	if (fd == AT_FDCWD)
346 		return scnprintf(bf, size, "CWD");
347 
348 	return syscall_arg__scnprintf_fd(bf, size, arg);
349 }
350 
351 #define SCA_FDAT syscall_arg__scnprintf_fd_at
352 
353 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
354 					      struct syscall_arg *arg);
355 
356 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
357 
/* Format the argument as a 0x-prefixed hexadecimal unsigned long. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
365 
366 static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
367 					 struct syscall_arg *arg)
368 {
369 	return scnprintf(bf, size, "%d", arg->val);
370 }
371 
372 #define SCA_INT syscall_arg__scnprintf_int
373 
374 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
375 					       struct syscall_arg *arg)
376 {
377 	int printed = 0, prot = arg->val;
378 
379 	if (prot == PROT_NONE)
380 		return scnprintf(bf, size, "NONE");
381 #define	P_MMAP_PROT(n) \
382 	if (prot & PROT_##n) { \
383 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
384 		prot &= ~PROT_##n; \
385 	}
386 
387 	P_MMAP_PROT(EXEC);
388 	P_MMAP_PROT(READ);
389 	P_MMAP_PROT(WRITE);
390 #ifdef PROT_SEM
391 	P_MMAP_PROT(SEM);
392 #endif
393 	P_MMAP_PROT(GROWSDOWN);
394 	P_MMAP_PROT(GROWSUP);
395 #undef P_MMAP_PROT
396 
397 	if (prot)
398 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
399 
400 	return printed;
401 }
402 
403 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
404 
405 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
406 						struct syscall_arg *arg)
407 {
408 	int printed = 0, flags = arg->val;
409 
410 #define	P_MMAP_FLAG(n) \
411 	if (flags & MAP_##n) { \
412 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
413 		flags &= ~MAP_##n; \
414 	}
415 
416 	P_MMAP_FLAG(SHARED);
417 	P_MMAP_FLAG(PRIVATE);
418 #ifdef MAP_32BIT
419 	P_MMAP_FLAG(32BIT);
420 #endif
421 	P_MMAP_FLAG(ANONYMOUS);
422 	P_MMAP_FLAG(DENYWRITE);
423 	P_MMAP_FLAG(EXECUTABLE);
424 	P_MMAP_FLAG(FILE);
425 	P_MMAP_FLAG(FIXED);
426 	P_MMAP_FLAG(GROWSDOWN);
427 #ifdef MAP_HUGETLB
428 	P_MMAP_FLAG(HUGETLB);
429 #endif
430 	P_MMAP_FLAG(LOCKED);
431 	P_MMAP_FLAG(NONBLOCK);
432 	P_MMAP_FLAG(NORESERVE);
433 	P_MMAP_FLAG(POPULATE);
434 	P_MMAP_FLAG(STACK);
435 #ifdef MAP_UNINITIALIZED
436 	P_MMAP_FLAG(UNINITIALIZED);
437 #endif
438 #undef P_MMAP_FLAG
439 
440 	if (flags)
441 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
442 
443 	return printed;
444 }
445 
446 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
447 
448 static size_t syscall_arg__scnprintf_mremap_flags(char *bf, size_t size,
449 						  struct syscall_arg *arg)
450 {
451 	int printed = 0, flags = arg->val;
452 
453 #define P_MREMAP_FLAG(n) \
454 	if (flags & MREMAP_##n) { \
455 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
456 		flags &= ~MREMAP_##n; \
457 	}
458 
459 	P_MREMAP_FLAG(MAYMOVE);
460 #ifdef MREMAP_FIXED
461 	P_MREMAP_FLAG(FIXED);
462 #endif
463 #undef P_MREMAP_FLAG
464 
465 	if (flags)
466 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
467 
468 	return printed;
469 }
470 
471 #define SCA_MREMAP_FLAGS syscall_arg__scnprintf_mremap_flags
472 
473 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
474 						      struct syscall_arg *arg)
475 {
476 	int behavior = arg->val;
477 
478 	switch (behavior) {
479 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
480 	P_MADV_BHV(NORMAL);
481 	P_MADV_BHV(RANDOM);
482 	P_MADV_BHV(SEQUENTIAL);
483 	P_MADV_BHV(WILLNEED);
484 	P_MADV_BHV(DONTNEED);
485 	P_MADV_BHV(REMOVE);
486 	P_MADV_BHV(DONTFORK);
487 	P_MADV_BHV(DOFORK);
488 	P_MADV_BHV(HWPOISON);
489 #ifdef MADV_SOFT_OFFLINE
490 	P_MADV_BHV(SOFT_OFFLINE);
491 #endif
492 	P_MADV_BHV(MERGEABLE);
493 	P_MADV_BHV(UNMERGEABLE);
494 #ifdef MADV_HUGEPAGE
495 	P_MADV_BHV(HUGEPAGE);
496 #endif
497 #ifdef MADV_NOHUGEPAGE
498 	P_MADV_BHV(NOHUGEPAGE);
499 #endif
500 #ifdef MADV_DONTDUMP
501 	P_MADV_BHV(DONTDUMP);
502 #endif
503 #ifdef MADV_DODUMP
504 	P_MADV_BHV(DODUMP);
505 #endif
506 #undef P_MADV_PHV
507 	default: break;
508 	}
509 
510 	return scnprintf(bf, size, "%#x", behavior);
511 }
512 
513 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
514 
515 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
516 					   struct syscall_arg *arg)
517 {
518 	int printed = 0, op = arg->val;
519 
520 	if (op == 0)
521 		return scnprintf(bf, size, "NONE");
522 #define	P_CMD(cmd) \
523 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
524 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
525 		op &= ~LOCK_##cmd; \
526 	}
527 
528 	P_CMD(SH);
529 	P_CMD(EX);
530 	P_CMD(NB);
531 	P_CMD(UN);
532 	P_CMD(MAND);
533 	P_CMD(RW);
534 	P_CMD(READ);
535 	P_CMD(WRITE);
536 #undef P_OP
537 
538 	if (op)
539 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
540 
541 	return printed;
542 }
543 
544 #define SCA_FLOCK syscall_arg__scnprintf_flock
545 
546 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
547 {
548 	enum syscall_futex_args {
549 		SCF_UADDR   = (1 << 0),
550 		SCF_OP	    = (1 << 1),
551 		SCF_VAL	    = (1 << 2),
552 		SCF_TIMEOUT = (1 << 3),
553 		SCF_UADDR2  = (1 << 4),
554 		SCF_VAL3    = (1 << 5),
555 	};
556 	int op = arg->val;
557 	int cmd = op & FUTEX_CMD_MASK;
558 	size_t printed = 0;
559 
560 	switch (cmd) {
561 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
562 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
563 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
564 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
565 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
566 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
567 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
568 	P_FUTEX_OP(WAKE_OP);							  break;
569 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
570 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
571 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
572 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
573 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
574 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
575 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
576 	}
577 
578 	if (op & FUTEX_PRIVATE_FLAG)
579 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
580 
581 	if (op & FUTEX_CLOCK_REALTIME)
582 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
583 
584 	return printed;
585 }
586 
587 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
588 
/*
 * Name tables for integer syscall arguments. Each table is wrapped in a
 * 'strarray__<name>' by DEFINE_STRARRAY*(); the position of a name in the
 * table (plus the offset, if any) is the value it stands for.
 */

/* bpf commands */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl operations, the first name stands for value 1 */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* interval timer selectors */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl options */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* seek origins; DATA and HOLE are present only when the headers know them */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl commands */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* resource limit identifiers */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* signal mask 'how' values */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock ids */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket address/protocol families */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
654 
655 #ifndef SOCK_TYPE_MASK
656 #define SOCK_TYPE_MASK 0xf
657 #endif
658 
659 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
660 						      struct syscall_arg *arg)
661 {
662 	size_t printed;
663 	int type = arg->val,
664 	    flags = type & ~SOCK_TYPE_MASK;
665 
666 	type &= SOCK_TYPE_MASK;
667 	/*
668  	 * Can't use a strarray, MIPS may override for ABI reasons.
669  	 */
670 	switch (type) {
671 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
672 	P_SK_TYPE(STREAM);
673 	P_SK_TYPE(DGRAM);
674 	P_SK_TYPE(RAW);
675 	P_SK_TYPE(RDM);
676 	P_SK_TYPE(SEQPACKET);
677 	P_SK_TYPE(DCCP);
678 	P_SK_TYPE(PACKET);
679 #undef P_SK_TYPE
680 	default:
681 		printed = scnprintf(bf, size, "%#x", type);
682 	}
683 
684 #define	P_SK_FLAG(n) \
685 	if (flags & SOCK_##n) { \
686 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
687 		flags &= ~SOCK_##n; \
688 	}
689 
690 	P_SK_FLAG(CLOEXEC);
691 	P_SK_FLAG(NONBLOCK);
692 #undef P_SK_FLAG
693 
694 	if (flags)
695 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
696 
697 	return printed;
698 }
699 
700 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
701 
702 #ifndef MSG_PROBE
703 #define MSG_PROBE	     0x10
704 #endif
705 #ifndef MSG_WAITFORONE
706 #define MSG_WAITFORONE	0x10000
707 #endif
708 #ifndef MSG_SENDPAGE_NOTLAST
709 #define MSG_SENDPAGE_NOTLAST 0x20000
710 #endif
711 #ifndef MSG_FASTOPEN
712 #define MSG_FASTOPEN	     0x20000000
713 #endif
714 
715 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
716 					       struct syscall_arg *arg)
717 {
718 	int printed = 0, flags = arg->val;
719 
720 	if (flags == 0)
721 		return scnprintf(bf, size, "NONE");
722 #define	P_MSG_FLAG(n) \
723 	if (flags & MSG_##n) { \
724 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
725 		flags &= ~MSG_##n; \
726 	}
727 
728 	P_MSG_FLAG(OOB);
729 	P_MSG_FLAG(PEEK);
730 	P_MSG_FLAG(DONTROUTE);
731 	P_MSG_FLAG(TRYHARD);
732 	P_MSG_FLAG(CTRUNC);
733 	P_MSG_FLAG(PROBE);
734 	P_MSG_FLAG(TRUNC);
735 	P_MSG_FLAG(DONTWAIT);
736 	P_MSG_FLAG(EOR);
737 	P_MSG_FLAG(WAITALL);
738 	P_MSG_FLAG(FIN);
739 	P_MSG_FLAG(SYN);
740 	P_MSG_FLAG(CONFIRM);
741 	P_MSG_FLAG(RST);
742 	P_MSG_FLAG(ERRQUEUE);
743 	P_MSG_FLAG(NOSIGNAL);
744 	P_MSG_FLAG(MORE);
745 	P_MSG_FLAG(WAITFORONE);
746 	P_MSG_FLAG(SENDPAGE_NOTLAST);
747 	P_MSG_FLAG(FASTOPEN);
748 	P_MSG_FLAG(CMSG_CLOEXEC);
749 #undef P_MSG_FLAG
750 
751 	if (flags)
752 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
753 
754 	return printed;
755 }
756 
757 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
758 
759 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
760 						 struct syscall_arg *arg)
761 {
762 	size_t printed = 0;
763 	int mode = arg->val;
764 
765 	if (mode == F_OK) /* 0 */
766 		return scnprintf(bf, size, "F");
767 #define	P_MODE(n) \
768 	if (mode & n##_OK) { \
769 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
770 		mode &= ~n##_OK; \
771 	}
772 
773 	P_MODE(R);
774 	P_MODE(W);
775 	P_MODE(X);
776 #undef P_MODE
777 
778 	if (mode)
779 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
780 
781 	return printed;
782 }
783 
784 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
785 
786 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
787 					      struct syscall_arg *arg);
788 
789 #define SCA_FILENAME syscall_arg__scnprintf_filename
790 
791 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
792 					       struct syscall_arg *arg)
793 {
794 	int printed = 0, flags = arg->val;
795 
796 	if (!(flags & O_CREAT))
797 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
798 
799 	if (flags == 0)
800 		return scnprintf(bf, size, "RDONLY");
801 #define	P_FLAG(n) \
802 	if (flags & O_##n) { \
803 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
804 		flags &= ~O_##n; \
805 	}
806 
807 	P_FLAG(APPEND);
808 	P_FLAG(ASYNC);
809 	P_FLAG(CLOEXEC);
810 	P_FLAG(CREAT);
811 	P_FLAG(DIRECT);
812 	P_FLAG(DIRECTORY);
813 	P_FLAG(EXCL);
814 	P_FLAG(LARGEFILE);
815 	P_FLAG(NOATIME);
816 	P_FLAG(NOCTTY);
817 #ifdef O_NONBLOCK
818 	P_FLAG(NONBLOCK);
819 #elif O_NDELAY
820 	P_FLAG(NDELAY);
821 #endif
822 #ifdef O_PATH
823 	P_FLAG(PATH);
824 #endif
825 	P_FLAG(RDWR);
826 #ifdef O_DSYNC
827 	if ((flags & O_SYNC) == O_SYNC)
828 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
829 	else {
830 		P_FLAG(DSYNC);
831 	}
832 #else
833 	P_FLAG(SYNC);
834 #endif
835 	P_FLAG(TRUNC);
836 	P_FLAG(WRONLY);
837 #undef P_FLAG
838 
839 	if (flags)
840 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
841 
842 	return printed;
843 }
844 
845 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
846 
847 static size_t syscall_arg__scnprintf_perf_flags(char *bf, size_t size,
848 						struct syscall_arg *arg)
849 {
850 	int printed = 0, flags = arg->val;
851 
852 	if (flags == 0)
853 		return 0;
854 
855 #define	P_FLAG(n) \
856 	if (flags & PERF_FLAG_##n) { \
857 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
858 		flags &= ~PERF_FLAG_##n; \
859 	}
860 
861 	P_FLAG(FD_NO_GROUP);
862 	P_FLAG(FD_OUTPUT);
863 	P_FLAG(PID_CGROUP);
864 	P_FLAG(FD_CLOEXEC);
865 #undef P_FLAG
866 
867 	if (flags)
868 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
869 
870 	return printed;
871 }
872 
873 #define SCA_PERF_FLAGS syscall_arg__scnprintf_perf_flags
874 
875 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
876 						   struct syscall_arg *arg)
877 {
878 	int printed = 0, flags = arg->val;
879 
880 	if (flags == 0)
881 		return scnprintf(bf, size, "NONE");
882 #define	P_FLAG(n) \
883 	if (flags & EFD_##n) { \
884 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
885 		flags &= ~EFD_##n; \
886 	}
887 
888 	P_FLAG(SEMAPHORE);
889 	P_FLAG(CLOEXEC);
890 	P_FLAG(NONBLOCK);
891 #undef P_FLAG
892 
893 	if (flags)
894 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
895 
896 	return printed;
897 }
898 
899 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
900 
901 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
902 						struct syscall_arg *arg)
903 {
904 	int printed = 0, flags = arg->val;
905 
906 #define	P_FLAG(n) \
907 	if (flags & O_##n) { \
908 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
909 		flags &= ~O_##n; \
910 	}
911 
912 	P_FLAG(CLOEXEC);
913 	P_FLAG(NONBLOCK);
914 #undef P_FLAG
915 
916 	if (flags)
917 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
918 
919 	return printed;
920 }
921 
922 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
923 
924 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
925 {
926 	int sig = arg->val;
927 
928 	switch (sig) {
929 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
930 	P_SIGNUM(HUP);
931 	P_SIGNUM(INT);
932 	P_SIGNUM(QUIT);
933 	P_SIGNUM(ILL);
934 	P_SIGNUM(TRAP);
935 	P_SIGNUM(ABRT);
936 	P_SIGNUM(BUS);
937 	P_SIGNUM(FPE);
938 	P_SIGNUM(KILL);
939 	P_SIGNUM(USR1);
940 	P_SIGNUM(SEGV);
941 	P_SIGNUM(USR2);
942 	P_SIGNUM(PIPE);
943 	P_SIGNUM(ALRM);
944 	P_SIGNUM(TERM);
945 	P_SIGNUM(CHLD);
946 	P_SIGNUM(CONT);
947 	P_SIGNUM(STOP);
948 	P_SIGNUM(TSTP);
949 	P_SIGNUM(TTIN);
950 	P_SIGNUM(TTOU);
951 	P_SIGNUM(URG);
952 	P_SIGNUM(XCPU);
953 	P_SIGNUM(XFSZ);
954 	P_SIGNUM(VTALRM);
955 	P_SIGNUM(PROF);
956 	P_SIGNUM(WINCH);
957 	P_SIGNUM(IO);
958 	P_SIGNUM(PWR);
959 	P_SIGNUM(SYS);
960 #ifdef SIGEMT
961 	P_SIGNUM(EMT);
962 #endif
963 #ifdef SIGSTKFLT
964 	P_SIGNUM(STKFLT);
965 #endif
966 #ifdef SIGSWI
967 	P_SIGNUM(SWI);
968 #endif
969 	default: break;
970 	}
971 
972 	return scnprintf(bf, size, "%#x", sig);
973 }
974 
975 #define SCA_SIGNUM syscall_arg__scnprintf_signum
976 
977 #if defined(__i386__) || defined(__x86_64__)
978 /*
979  * FIXME: Make this available to all arches.
980  */
#define TCGETS		0x5401

/*
 * Names of the terminal ioctl commands, indexed by (cmd - 0x5401), i.e.
 * relative to TCGETS. The designated initializers skip over ranges that
 * have no name; those slots are NULL.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
1002 #endif /* defined(__i386__) || defined(__x86_64__) */
1003 
/*
 * Initializer fragment for a struct syscall_fmt entry: argument number
 * 'arg' is formatted with SCA_STRARRAY using 'strarray__<array>' as its
 * parameter. 'name' is not used by the expansion; it only documents the
 * argument at the point of use.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
1007 
1008 static struct syscall_fmt {
1009 	const char *name;
1010 	const char *alias;
1011 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
1012 	void	   *arg_parm[6];
1013 	bool	   errmsg;
1014 	bool	   timeout;
1015 	bool	   hexret;
1016 } syscall_fmts[] = {
1017 	{ .name	    = "access",	    .errmsg = true,
1018 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */
1019 			     [1] = SCA_ACCMODE,  /* mode */ }, },
1020 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
1021 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
1022 	{ .name	    = "brk",	    .hexret = true,
1023 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
1024 	{ .name	    = "chdir",	    .errmsg = true,
1025 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1026 	{ .name	    = "chmod",	    .errmsg = true,
1027 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1028 	{ .name	    = "chroot",	    .errmsg = true,
1029 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1030 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
1031 	{ .name	    = "close",	    .errmsg = true,
1032 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
1033 	{ .name	    = "connect",    .errmsg = true, },
1034 	{ .name	    = "creat",	    .errmsg = true,
1035 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1036 	{ .name	    = "dup",	    .errmsg = true,
1037 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1038 	{ .name	    = "dup2",	    .errmsg = true,
1039 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1040 	{ .name	    = "dup3",	    .errmsg = true,
1041 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1042 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
1043 	{ .name	    = "eventfd2",   .errmsg = true,
1044 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
1045 	{ .name	    = "faccessat",  .errmsg = true,
1046 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1047 			     [1] = SCA_FILENAME, /* filename */ }, },
1048 	{ .name	    = "fadvise64",  .errmsg = true,
1049 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1050 	{ .name	    = "fallocate",  .errmsg = true,
1051 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1052 	{ .name	    = "fchdir",	    .errmsg = true,
1053 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1054 	{ .name	    = "fchmod",	    .errmsg = true,
1055 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1056 	{ .name	    = "fchmodat",   .errmsg = true,
1057 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1058 			     [1] = SCA_FILENAME, /* filename */ }, },
1059 	{ .name	    = "fchown",	    .errmsg = true,
1060 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1061 	{ .name	    = "fchownat",   .errmsg = true,
1062 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1063 			     [1] = SCA_FILENAME, /* filename */ }, },
1064 	{ .name	    = "fcntl",	    .errmsg = true,
1065 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1066 			     [1] = SCA_STRARRAY, /* cmd */ },
1067 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
1068 	{ .name	    = "fdatasync",  .errmsg = true,
1069 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1070 	{ .name	    = "flock",	    .errmsg = true,
1071 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1072 			     [1] = SCA_FLOCK, /* cmd */ }, },
1073 	{ .name	    = "fsetxattr",  .errmsg = true,
1074 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1075 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
1076 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1077 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
1078 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1079 			     [1] = SCA_FILENAME, /* filename */ }, },
1080 	{ .name	    = "fstatfs",    .errmsg = true,
1081 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1082 	{ .name	    = "fsync",    .errmsg = true,
1083 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1084 	{ .name	    = "ftruncate", .errmsg = true,
1085 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1086 	{ .name	    = "futex",	    .errmsg = true,
1087 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
1088 	{ .name	    = "futimesat", .errmsg = true,
1089 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1090 			     [1] = SCA_FILENAME, /* filename */ }, },
1091 	{ .name	    = "getdents",   .errmsg = true,
1092 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093 	{ .name	    = "getdents64", .errmsg = true,
1094 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1095 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1096 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1097 	{ .name	    = "getxattr",    .errmsg = true,
1098 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1099 	{ .name	    = "inotify_add_watch",	    .errmsg = true,
1100 	  .arg_scnprintf = { [1] = SCA_FILENAME, /* pathname */ }, },
1101 	{ .name	    = "ioctl",	    .errmsg = true,
1102 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1103 #if defined(__i386__) || defined(__x86_64__)
1104 /*
1105  * FIXME: Make this available to all arches.
1106  */
1107 			     [1] = SCA_STRHEXARRAY, /* cmd */
1108 			     [2] = SCA_HEX, /* arg */ },
1109 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
1110 #else
1111 			     [2] = SCA_HEX, /* arg */ }, },
1112 #endif
1113 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
1114 	{ .name	    = "kill",	    .errmsg = true,
1115 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1116 	{ .name	    = "lchown",    .errmsg = true,
1117 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1118 	{ .name	    = "lgetxattr",  .errmsg = true,
1119 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1120 	{ .name	    = "linkat",	    .errmsg = true,
1121 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
1122 	{ .name	    = "listxattr",  .errmsg = true,
1123 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1124 	{ .name	    = "llistxattr", .errmsg = true,
1125 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1126 	{ .name	    = "lremovexattr",  .errmsg = true,
1127 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1128 	{ .name	    = "lseek",	    .errmsg = true,
1129 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1130 			     [2] = SCA_STRARRAY, /* whence */ },
1131 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
1132 	{ .name	    = "lsetxattr",  .errmsg = true,
1133 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1134 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat",
1135 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1136 	{ .name	    = "lsxattr",    .errmsg = true,
1137 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1138 	{ .name     = "madvise",    .errmsg = true,
1139 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
1140 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
1141 	{ .name	    = "mkdir",    .errmsg = true,
1142 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1143 	{ .name	    = "mkdirat",    .errmsg = true,
1144 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1145 			     [1] = SCA_FILENAME, /* pathname */ }, },
1146 	{ .name	    = "mknod",      .errmsg = true,
1147 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1148 	{ .name	    = "mknodat",    .errmsg = true,
1149 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */
1150 			     [1] = SCA_FILENAME, /* filename */ }, },
1151 	{ .name	    = "mlock",	    .errmsg = true,
1152 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1153 	{ .name	    = "mlockall",   .errmsg = true,
1154 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1155 	{ .name	    = "mmap",	    .hexret = true,
1156 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
1157 			     [2] = SCA_MMAP_PROT, /* prot */
1158 			     [3] = SCA_MMAP_FLAGS, /* flags */
1159 			     [4] = SCA_FD, 	  /* fd */ }, },
1160 	{ .name	    = "mprotect",   .errmsg = true,
1161 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1162 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1163 	{ .name	    = "mq_unlink", .errmsg = true,
1164 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
1165 	{ .name	    = "mremap",	    .hexret = true,
1166 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1167 			     [3] = SCA_MREMAP_FLAGS, /* flags */
1168 			     [4] = SCA_HEX, /* new_addr */ }, },
1169 	{ .name	    = "munlock",    .errmsg = true,
1170 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1171 	{ .name	    = "munmap",	    .errmsg = true,
1172 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1173 	{ .name	    = "name_to_handle_at", .errmsg = true,
1174 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1175 	{ .name	    = "newfstatat", .errmsg = true,
1176 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1177 			     [1] = SCA_FILENAME, /* filename */ }, },
1178 	{ .name	    = "open",	    .errmsg = true,
1179 	  .arg_scnprintf = { [0] = SCA_FILENAME,   /* filename */
1180 			     [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1181 	{ .name	    = "open_by_handle_at", .errmsg = true,
1182 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1183 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1184 	{ .name	    = "openat",	    .errmsg = true,
1185 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1186 			     [1] = SCA_FILENAME, /* filename */
1187 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1188 	{ .name	    = "perf_event_open", .errmsg = true,
1189 	  .arg_scnprintf = { [1] = SCA_INT, /* pid */
1190 			     [2] = SCA_INT, /* cpu */
1191 			     [3] = SCA_FD,  /* group_fd */
1192 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
1193 	{ .name	    = "pipe2",	    .errmsg = true,
1194 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1195 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1196 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1197 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1198 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1199 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1200 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1201 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1202 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1203 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1204 	{ .name	    = "pwritev",    .errmsg = true,
1205 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1206 	{ .name	    = "read",	    .errmsg = true,
1207 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1208 	{ .name	    = "readlink",   .errmsg = true,
1209 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1210 	{ .name	    = "readlinkat", .errmsg = true,
1211 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1212 			     [1] = SCA_FILENAME, /* pathname */ }, },
1213 	{ .name	    = "readv",	    .errmsg = true,
1214 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1215 	{ .name	    = "recvfrom",   .errmsg = true,
1216 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1217 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1218 	{ .name	    = "recvmmsg",   .errmsg = true,
1219 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1220 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1221 	{ .name	    = "recvmsg",    .errmsg = true,
1222 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1223 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1224 	{ .name	    = "removexattr", .errmsg = true,
1225 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1226 	{ .name	    = "renameat",   .errmsg = true,
1227 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1228 	{ .name	    = "rmdir",    .errmsg = true,
1229 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1230 	{ .name	    = "rt_sigaction", .errmsg = true,
1231 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1232 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1233 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1234 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1235 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1236 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1237 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1238 	{ .name	    = "sendmmsg",    .errmsg = true,
1239 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1240 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1241 	{ .name	    = "sendmsg",    .errmsg = true,
1242 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1243 			     [2] = SCA_MSG_FLAGS, /* flags */ }, },
1244 	{ .name	    = "sendto",	    .errmsg = true,
1245 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
1246 			     [3] = SCA_MSG_FLAGS, /* flags */ }, },
1247 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1248 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1249 	{ .name	    = "setxattr",   .errmsg = true,
1250 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1251 	{ .name	    = "shutdown",   .errmsg = true,
1252 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1253 	{ .name	    = "socket",	    .errmsg = true,
1254 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1255 			     [1] = SCA_SK_TYPE, /* type */ },
1256 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1257 	{ .name	    = "socketpair", .errmsg = true,
1258 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1259 			     [1] = SCA_SK_TYPE, /* type */ },
1260 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1261 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat",
1262 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1263 	{ .name	    = "statfs",	    .errmsg = true,
1264 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* pathname */ }, },
1265 	{ .name	    = "swapoff",    .errmsg = true,
1266 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1267 	{ .name	    = "swapon",	    .errmsg = true,
1268 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
1269 	{ .name	    = "symlinkat",  .errmsg = true,
1270 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1271 	{ .name	    = "tgkill",	    .errmsg = true,
1272 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1273 	{ .name	    = "tkill",	    .errmsg = true,
1274 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1275 	{ .name	    = "truncate",   .errmsg = true,
1276 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* path */ }, },
1277 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1278 	{ .name	    = "unlinkat",   .errmsg = true,
1279 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1280 			     [1] = SCA_FILENAME, /* pathname */ }, },
1281 	{ .name	    = "utime",  .errmsg = true,
1282 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1283 	{ .name	    = "utimensat",  .errmsg = true,
1284 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */
1285 			     [1] = SCA_FILENAME, /* filename */ }, },
1286 	{ .name	    = "utimes",  .errmsg = true,
1287 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* filename */ }, },
1288 	{ .name	    = "vmsplice",  .errmsg = true,
1289 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1290 	{ .name	    = "write",	    .errmsg = true,
1291 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1292 	{ .name	    = "writev",	    .errmsg = true,
1293 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1294 };
1295 
1296 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1297 {
1298 	const struct syscall_fmt *fmt = fmtp;
1299 	return strcmp(name, fmt->name);
1300 }
1301 
1302 static struct syscall_fmt *syscall_fmt__find(const char *name)
1303 {
1304 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1305 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1306 }
1307 
/*
 * Cached description of one syscall. Entries live in trace->syscalls.table[],
 * indexed by syscall id, and are filled in by trace__read_syscall_info().
 */
struct syscall {
	/* Format of the syscalls:sys_enter_<name> (or alias) tracepoint */
	struct event_format *tp_format;
	/* Number of argument fields, not counting a leading '__syscall_nr'/'nr' */
	int		    nr_args;
	/* First argument field of tp_format (the syscall number field is skipped) */
	struct format_field *args;
	/* Name as returned by audit_syscall_to_name() */
	const char	    *name;
	/* True for "exit" and "exit_group" */
	bool		    is_exit;
	/* Matching syscall_fmts[] entry, NULL if syscall_fmt__find() found none */
	struct syscall_fmt  *fmt;
	/* calloc()ed array with one (possibly NULL) formatter per argument */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* Per argument formatter parameters, taken from fmt->arg_parm when fmt is set */
	void		    **arg_parm;
};
1318 
1319 static size_t fprintf_duration(unsigned long t, FILE *fp)
1320 {
1321 	double duration = (double)t / NSEC_PER_MSEC;
1322 	size_t printed = fprintf(fp, "(");
1323 
1324 	if (duration >= 1.0)
1325 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1326 	else if (duration >= 0.01)
1327 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1328 	else
1329 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1330 	return printed + fprintf(fp, "): ");
1331 }
1332 
/**
 * struct thread_trace - per thread tracing state, stored as the thread's priv
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 */
struct thread_trace {
	/* sample->time of the last syscall entry seen (set in trace__sys_enter) */
	u64		  entry_time;
	/* NOTE(review): presumably the time of the last syscall exit - set outside this view */
	u64		  exit_time;
	/* An entry line was formatted into entry_str but not yet printed */
	bool		  entry_pending;
	/* Bumped every time thread__trace() is called for this thread */
	unsigned long	  nr_events;
	/* NOTE(review): presumably major/minor page fault counters - confirm */
	unsigned long	  pfmaj, pfmin;
	/* Buffer of trace__entry_str_size bytes holding the formatted syscall entry */
	char		  *entry_str;
	double		  runtime_ms;
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	/* fd -> pathname cache: table has max + 1 slots, max is -1 while empty */
	struct {
		int	  max;
		char	  **table;
	} paths;

	/* Keyed by syscall id; each node's priv points to a struct stats */
	struct intlist *syscall_stats;
};
1360 
1361 static struct thread_trace *thread_trace__new(void)
1362 {
1363 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1364 
1365 	if (ttrace)
1366 		ttrace->paths.max = -1;
1367 
1368 	ttrace->syscall_stats = intlist__new(NULL);
1369 
1370 	return ttrace;
1371 }
1372 
1373 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1374 {
1375 	struct thread_trace *ttrace;
1376 
1377 	if (thread == NULL)
1378 		goto fail;
1379 
1380 	if (thread__priv(thread) == NULL)
1381 		thread__set_priv(thread, thread_trace__new());
1382 
1383 	if (thread__priv(thread) == NULL)
1384 		goto fail;
1385 
1386 	ttrace = thread__priv(thread);
1387 	++ttrace->nr_events;
1388 
1389 	return ttrace;
1390 fail:
1391 	color_fprintf(fp, PERF_COLOR_RED,
1392 		      "WARNING: not enough memory, dropping samples!\n");
1393 	return NULL;
1394 }
1395 
/*
 * Bit flags for major and minor page faults.
 * NOTE(review): presumably combined into trace->trace_pgfaults - confirm.
 */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/*
 * Size in bytes of the per thread entry_str buffer: trace__sys_enter() uses it
 * both as the malloc() size and as the bound for formatting into the buffer.
 */
static const size_t trace__entry_str_size = 2048;
1400 
/*
 * Global state of one 'perf trace' session.
 */
struct trace {
	/* Embedded tool; trace__tool_process() recovers the trace via container_of() */
	struct perf_tool	tool;
	struct {
		/* Machine type handed to audit_syscall_to_name()/audit_name_to_syscall() */
		int		machine;
		int		open_id;
	}			audit;
	/* Syscall description cache: table has max + 1 slots indexed by syscall id */
	struct {
		int		max;
		struct syscall  *table;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	/* Created by machine__new_host() in trace__symbols_init() */
	struct machine		*host;
	/* Thread whose pending entry trace__printf_interrupted_entry() flushes */
	struct thread		*current;
	/* Subtracted from sample timestamps before printing them */
	u64			base_time;
	/* Where all trace output and diagnostics are written */
	FILE			*output;
	unsigned long		nr_events;
	/* Syscall names given by the user; resolved by trace__validate_ev_qualifier() */
	struct strlist		*ev_qualifier;
	/* Syscall ids matching ev_qualifier, one per entry */
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;
	}			filter_pids;
	/* In milliseconds: scaled by NSEC_PER_MSEC in trace__filter_duration() */
	double			duration_filter;
	double			runtime_ms;
	/* Tool statistics; proc_getname counts fd path lookups that went to /proc */
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	bool			not_ev_qualifier;
	/* When false, thread__fd_path() never consults /proc for unknown fds */
	bool			live;
	bool			full_time;
	bool			sched;
	/* Prefix each line with the tid (and comm, if show_comm) */
	bool			multiple_threads;
	bool			summary;
	/* Suppresses the per syscall output in trace__sys_enter() */
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			force;
	/* When false, filename arguments are printed as hex pointers */
	bool			vfs_getname;
	/* NOTE(review): presumably a mask of TRACE_PFMAJ/TRACE_PFMIN - confirm */
	int			trace_pgfaults;
};
1453 
1454 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1455 {
1456 	struct thread_trace *ttrace = thread__priv(thread);
1457 
1458 	if (fd > ttrace->paths.max) {
1459 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1460 
1461 		if (npath == NULL)
1462 			return -1;
1463 
1464 		if (ttrace->paths.max != -1) {
1465 			memset(npath + ttrace->paths.max + 1, 0,
1466 			       (fd - ttrace->paths.max) * sizeof(char *));
1467 		} else {
1468 			memset(npath, 0, (fd + 1) * sizeof(char *));
1469 		}
1470 
1471 		ttrace->paths.table = npath;
1472 		ttrace->paths.max   = fd;
1473 	}
1474 
1475 	ttrace->paths.table[fd] = strdup(pathname);
1476 
1477 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1478 }
1479 
1480 static int thread__read_fd_path(struct thread *thread, int fd)
1481 {
1482 	char linkname[PATH_MAX], pathname[PATH_MAX];
1483 	struct stat st;
1484 	int ret;
1485 
1486 	if (thread->pid_ == thread->tid) {
1487 		scnprintf(linkname, sizeof(linkname),
1488 			  "/proc/%d/fd/%d", thread->pid_, fd);
1489 	} else {
1490 		scnprintf(linkname, sizeof(linkname),
1491 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1492 	}
1493 
1494 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1495 		return -1;
1496 
1497 	ret = readlink(linkname, pathname, sizeof(pathname));
1498 
1499 	if (ret < 0 || ret > st.st_size)
1500 		return -1;
1501 
1502 	pathname[ret] = '\0';
1503 	return trace__set_fd_pathname(thread, fd, pathname);
1504 }
1505 
1506 static const char *thread__fd_path(struct thread *thread, int fd,
1507 				   struct trace *trace)
1508 {
1509 	struct thread_trace *ttrace = thread__priv(thread);
1510 
1511 	if (ttrace == NULL)
1512 		return NULL;
1513 
1514 	if (fd < 0)
1515 		return NULL;
1516 
1517 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1518 		if (!trace->live)
1519 			return NULL;
1520 		++trace->stats.proc_getname;
1521 		if (thread__read_fd_path(thread, fd))
1522 			return NULL;
1523 	}
1524 
1525 	return ttrace->paths.table[fd];
1526 }
1527 
1528 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1529 					struct syscall_arg *arg)
1530 {
1531 	int fd = arg->val;
1532 	size_t printed = scnprintf(bf, size, "%d", fd);
1533 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1534 
1535 	if (path)
1536 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1537 
1538 	return printed;
1539 }
1540 
1541 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1542 					      struct syscall_arg *arg)
1543 {
1544 	int fd = arg->val;
1545 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1546 	struct thread_trace *ttrace = thread__priv(arg->thread);
1547 
1548 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1549 		zfree(&ttrace->paths.table[fd]);
1550 
1551 	return printed;
1552 }
1553 
1554 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1555 				     unsigned long ptr)
1556 {
1557 	struct thread_trace *ttrace = thread__priv(thread);
1558 
1559 	ttrace->filename.ptr = ptr;
1560 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1561 }
1562 
1563 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1564 					      struct syscall_arg *arg)
1565 {
1566 	unsigned long ptr = arg->val;
1567 
1568 	if (!arg->trace->vfs_getname)
1569 		return scnprintf(bf, size, "%#x", ptr);
1570 
1571 	thread__set_filename_pos(arg->thread, bf, ptr);
1572 	return 0;
1573 }
1574 
1575 static bool trace__filter_duration(struct trace *trace, double t)
1576 {
1577 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1578 }
1579 
1580 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1581 {
1582 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1583 
1584 	return fprintf(fp, "%10.3f ", ts);
1585 }
1586 
/*
 * Flags written from sig_handler(). volatile sig_atomic_t is the type the C
 * standard guarantees can be safely written from a signal handler and read
 * from the interrupted code.
 */
static volatile sig_atomic_t done = false;
static volatile sig_atomic_t interrupted = false;

/*
 * Signal handler: request termination and remember whether the request came
 * from SIGINT.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1595 
1596 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1597 					u64 duration, u64 tstamp, FILE *fp)
1598 {
1599 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1600 	printed += fprintf_duration(duration, fp);
1601 
1602 	if (trace->multiple_threads) {
1603 		if (trace->show_comm)
1604 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1605 		printed += fprintf(fp, "%d ", thread->tid);
1606 	}
1607 
1608 	return printed;
1609 }
1610 
1611 static int trace__process_event(struct trace *trace, struct machine *machine,
1612 				union perf_event *event, struct perf_sample *sample)
1613 {
1614 	int ret = 0;
1615 
1616 	switch (event->header.type) {
1617 	case PERF_RECORD_LOST:
1618 		color_fprintf(trace->output, PERF_COLOR_RED,
1619 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1620 		ret = machine__process_lost_event(machine, event, sample);
1621 	default:
1622 		ret = machine__process_event(machine, event, sample);
1623 		break;
1624 	}
1625 
1626 	return ret;
1627 }
1628 
1629 static int trace__tool_process(struct perf_tool *tool,
1630 			       union perf_event *event,
1631 			       struct perf_sample *sample,
1632 			       struct machine *machine)
1633 {
1634 	struct trace *trace = container_of(tool, struct trace, tool);
1635 	return trace__process_event(trace, machine, event, sample);
1636 }
1637 
1638 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1639 {
1640 	int err = symbol__init(NULL);
1641 
1642 	if (err)
1643 		return err;
1644 
1645 	trace->host = machine__new_host();
1646 	if (trace->host == NULL)
1647 		return -ENOMEM;
1648 
1649 	if (trace_event__register_resolver(trace->host, machine__resolve_kernel_addr) < 0)
1650 		return -errno;
1651 
1652 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1653 					    evlist->threads, trace__tool_process, false,
1654 					    trace->opts.proc_map_timeout);
1655 	if (err)
1656 		symbol__exit();
1657 
1658 	return err;
1659 }
1660 
1661 static int syscall__set_arg_fmts(struct syscall *sc)
1662 {
1663 	struct format_field *field;
1664 	int idx = 0;
1665 
1666 	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
1667 	if (sc->arg_scnprintf == NULL)
1668 		return -1;
1669 
1670 	if (sc->fmt)
1671 		sc->arg_parm = sc->fmt->arg_parm;
1672 
1673 	for (field = sc->args; field; field = field->next) {
1674 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1675 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1676 		else if (field->flags & FIELD_IS_POINTER)
1677 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1678 		++idx;
1679 	}
1680 
1681 	return 0;
1682 }
1683 
1684 static int trace__read_syscall_info(struct trace *trace, int id)
1685 {
1686 	char tp_name[128];
1687 	struct syscall *sc;
1688 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1689 
1690 	if (name == NULL)
1691 		return -1;
1692 
1693 	if (id > trace->syscalls.max) {
1694 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1695 
1696 		if (nsyscalls == NULL)
1697 			return -1;
1698 
1699 		if (trace->syscalls.max != -1) {
1700 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1701 			       (id - trace->syscalls.max) * sizeof(*sc));
1702 		} else {
1703 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1704 		}
1705 
1706 		trace->syscalls.table = nsyscalls;
1707 		trace->syscalls.max   = id;
1708 	}
1709 
1710 	sc = trace->syscalls.table + id;
1711 	sc->name = name;
1712 
1713 	sc->fmt  = syscall_fmt__find(sc->name);
1714 
1715 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1716 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1717 
1718 	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
1719 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1720 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1721 	}
1722 
1723 	if (IS_ERR(sc->tp_format))
1724 		return -1;
1725 
1726 	sc->args = sc->tp_format->format.fields;
1727 	sc->nr_args = sc->tp_format->format.nr_fields;
1728 	/*
1729 	 * We need to check and discard the first variable '__syscall_nr'
1730 	 * or 'nr' that mean the syscall number. It is needless here.
1731 	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
1732 	 */
1733 	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
1734 		sc->args = sc->args->next;
1735 		--sc->nr_args;
1736 	}
1737 
1738 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1739 
1740 	return syscall__set_arg_fmts(sc);
1741 }
1742 
1743 static int trace__validate_ev_qualifier(struct trace *trace)
1744 {
1745 	int err = 0, i;
1746 	struct str_node *pos;
1747 
1748 	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
1749 	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
1750 						 sizeof(trace->ev_qualifier_ids.entries[0]));
1751 
1752 	if (trace->ev_qualifier_ids.entries == NULL) {
1753 		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
1754 		       trace->output);
1755 		err = -EINVAL;
1756 		goto out;
1757 	}
1758 
1759 	i = 0;
1760 
1761 	strlist__for_each(pos, trace->ev_qualifier) {
1762 		const char *sc = pos->s;
1763 		int id = audit_name_to_syscall(sc, trace->audit.machine);
1764 
1765 		if (id < 0) {
1766 			if (err == 0) {
1767 				fputs("Error:\tInvalid syscall ", trace->output);
1768 				err = -EINVAL;
1769 			} else {
1770 				fputs(", ", trace->output);
1771 			}
1772 
1773 			fputs(sc, trace->output);
1774 		}
1775 
1776 		trace->ev_qualifier_ids.entries[i++] = id;
1777 	}
1778 
1779 	if (err < 0) {
1780 		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
1781 		      "\nHint:\tand: 'man syscalls'\n", trace->output);
1782 		zfree(&trace->ev_qualifier_ids.entries);
1783 		trace->ev_qualifier_ids.nr = 0;
1784 	}
1785 out:
1786 	return err;
1787 }
1788 
/*
 * Format the arguments of syscall @sc into @bf (at most @size bytes) and
 * return the number of characters produced.
 *
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 *
 * When the tracepoint format is known (sc->args != NULL) each argument is
 * printed as "name: value", using its formatter if one was set up, "%ld"
 * otherwise. When it is not known, six raw values are printed as "argN: %ld".
 */

static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		/* Mask bit of the argument currently being visited */
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/*
			 * arg.mask starts out as 0 and is not written in this
			 * function; the formatters below receive &arg.
			 * NOTE(review): presumably a formatter sets bits here
			 * to hide later arguments - confirm in the formatters.
			 */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			/* Separate from the previous argument only if one was printed */
			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else {
		int i = 0;

		/* No format information: dump six raw argument slots */
		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1865 
/*
 * Signature of the per tracepoint sample handlers, e.g. trace__sys_enter().
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1869 
1870 static struct syscall *trace__syscall_info(struct trace *trace,
1871 					   struct perf_evsel *evsel, int id)
1872 {
1873 
1874 	if (id < 0) {
1875 
1876 		/*
1877 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1878 		 * before that, leaving at a higher verbosity level till that is
1879 		 * explained. Reproduced with plain ftrace with:
1880 		 *
1881 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1882 		 * grep "NR -1 " /t/trace_pipe
1883 		 *
1884 		 * After generating some load on the machine.
1885  		 */
1886 		if (verbose > 1) {
1887 			static u64 n;
1888 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1889 				id, perf_evsel__name(evsel), ++n);
1890 		}
1891 		return NULL;
1892 	}
1893 
1894 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1895 	    trace__read_syscall_info(trace, id))
1896 		goto out_cant_read;
1897 
1898 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1899 		goto out_cant_read;
1900 
1901 	return &trace->syscalls.table[id];
1902 
1903 out_cant_read:
1904 	if (verbose) {
1905 		fprintf(trace->output, "Problems reading syscall %d", id);
1906 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1907 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1908 		fputs(" information\n", trace->output);
1909 	}
1910 	return NULL;
1911 }
1912 
1913 static void thread__update_stats(struct thread_trace *ttrace,
1914 				 int id, struct perf_sample *sample)
1915 {
1916 	struct int_node *inode;
1917 	struct stats *stats;
1918 	u64 duration = 0;
1919 
1920 	inode = intlist__findnew(ttrace->syscall_stats, id);
1921 	if (inode == NULL)
1922 		return;
1923 
1924 	stats = inode->priv;
1925 	if (stats == NULL) {
1926 		stats = malloc(sizeof(struct stats));
1927 		if (stats == NULL)
1928 			return;
1929 		init_stats(stats);
1930 		inode->priv = stats;
1931 	}
1932 
1933 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1934 		duration = sample->time - ttrace->entry_time;
1935 
1936 	update_stats(stats, duration);
1937 }
1938 
1939 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1940 {
1941 	struct thread_trace *ttrace;
1942 	u64 duration;
1943 	size_t printed;
1944 
1945 	if (trace->current == NULL)
1946 		return 0;
1947 
1948 	ttrace = thread__priv(trace->current);
1949 
1950 	if (!ttrace->entry_pending)
1951 		return 0;
1952 
1953 	duration = sample->time - ttrace->entry_time;
1954 
1955 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, sample->time, trace->output);
1956 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1957 	ttrace->entry_pending = false;
1958 
1959 	return printed;
1960 }
1961 
1962 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1963 			    union perf_event *event __maybe_unused,
1964 			    struct perf_sample *sample)
1965 {
1966 	char *msg;
1967 	void *args;
1968 	size_t printed = 0;
1969 	struct thread *thread;
1970 	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
1971 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1972 	struct thread_trace *ttrace;
1973 
1974 	if (sc == NULL)
1975 		return -1;
1976 
1977 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1978 	ttrace = thread__trace(thread, trace->output);
1979 	if (ttrace == NULL)
1980 		goto out_put;
1981 
1982 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1983 
1984 	if (ttrace->entry_str == NULL) {
1985 		ttrace->entry_str = malloc(trace__entry_str_size);
1986 		if (!ttrace->entry_str)
1987 			goto out_put;
1988 	}
1989 
1990 	if (!trace->summary_only)
1991 		trace__printf_interrupted_entry(trace, sample);
1992 
1993 	ttrace->entry_time = sample->time;
1994 	msg = ttrace->entry_str;
1995 	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);
1996 
1997 	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
1998 					   args, trace, thread);
1999 
2000 	if (sc->is_exit) {
2001 		if (!trace->duration_filter && !trace->summary_only) {
2002 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
2003 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
2004 		}
2005 	} else {
2006 		ttrace->entry_pending = true;
2007 		/* See trace__vfs_getname & trace__sys_exit */
2008 		ttrace->filename.pending_open = false;
2009 	}
2010 
2011 	if (trace->current != thread) {
2012 		thread__put(trace->current);
2013 		trace->current = thread__get(thread);
2014 	}
2015 	err = 0;
2016 out_put:
2017 	thread__put(thread);
2018 	return err;
2019 }
2020 
/*
 * Handler for the syscall exit tracepoint.
 *
 * Updates the per-thread summary statistics when requested, associates the
 * returned fd with a pending vfs_getname pathname for open(), applies the
 * duration filter and then prints the completed syscall line followed by its
 * return value, formatted according to the syscall's fmt flags.
 *
 * Returns 0 on success, -1 when the syscall info or the per-thread state
 * could not be obtained.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* A successful open() with a pathname captured by trace__vfs_getname */
	if (id == trace->audit.open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	/*
	 * Without an entry timestamp the duration is unknown, so the event is
	 * dropped whenever a duration filter is in effect.
	 */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
	} else if (trace->duration_filter)
		goto out;

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);

	/*
	 * Either complete the still pending entry line or, if it was already
	 * flushed, emit a "continued" marker with just the syscall name.
	 */
	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* signed_print is the fallback for every fmt that has no special case */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && sc->fmt->errmsg) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else
		goto signed_print;

	fputc('\n', trace->output);
out:
	/* Filtered or printed, the entry is no longer pending */
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2097 
2098 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
2099 			      union perf_event *event __maybe_unused,
2100 			      struct perf_sample *sample)
2101 {
2102 	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2103 	struct thread_trace *ttrace;
2104 	size_t filename_len, entry_str_len, to_move;
2105 	ssize_t remaining_space;
2106 	char *pos;
2107 	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");
2108 
2109 	if (!thread)
2110 		goto out;
2111 
2112 	ttrace = thread__priv(thread);
2113 	if (!ttrace)
2114 		goto out;
2115 
2116 	filename_len = strlen(filename);
2117 
2118 	if (ttrace->filename.namelen < filename_len) {
2119 		char *f = realloc(ttrace->filename.name, filename_len + 1);
2120 
2121 		if (f == NULL)
2122 				goto out;
2123 
2124 		ttrace->filename.namelen = filename_len;
2125 		ttrace->filename.name = f;
2126 	}
2127 
2128 	strcpy(ttrace->filename.name, filename);
2129 	ttrace->filename.pending_open = true;
2130 
2131 	if (!ttrace->filename.ptr)
2132 		goto out;
2133 
2134 	entry_str_len = strlen(ttrace->entry_str);
2135 	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
2136 	if (remaining_space <= 0)
2137 		goto out;
2138 
2139 	if (filename_len > (size_t)remaining_space) {
2140 		filename += filename_len - remaining_space;
2141 		filename_len = remaining_space;
2142 	}
2143 
2144 	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
2145 	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
2146 	memmove(pos + filename_len, pos, to_move);
2147 	memcpy(pos, filename, filename_len);
2148 
2149 	ttrace->filename.ptr = 0;
2150 	ttrace->filename.entry_str_pos = 0;
2151 out:
2152 	return 0;
2153 }
2154 
2155 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
2156 				     union perf_event *event __maybe_unused,
2157 				     struct perf_sample *sample)
2158 {
2159         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
2160 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
2161 	struct thread *thread = machine__findnew_thread(trace->host,
2162 							sample->pid,
2163 							sample->tid);
2164 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
2165 
2166 	if (ttrace == NULL)
2167 		goto out_dump;
2168 
2169 	ttrace->runtime_ms += runtime_ms;
2170 	trace->runtime_ms += runtime_ms;
2171 	thread__put(thread);
2172 	return 0;
2173 
2174 out_dump:
2175 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
2176 	       evsel->name,
2177 	       perf_evsel__strval(evsel, sample, "comm"),
2178 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
2179 	       runtime,
2180 	       perf_evsel__intval(evsel, sample, "vruntime"));
2181 	thread__put(thread);
2182 	return 0;
2183 }
2184 
2185 static void bpf_output__printer(enum binary_printer_ops op,
2186 				unsigned int val, void *extra)
2187 {
2188 	FILE *output = extra;
2189 	unsigned char ch = (unsigned char)val;
2190 
2191 	switch (op) {
2192 	case BINARY_PRINT_CHAR_DATA:
2193 		fprintf(output, "%c", isprint(ch) ? ch : '.');
2194 		break;
2195 	case BINARY_PRINT_DATA_BEGIN:
2196 	case BINARY_PRINT_LINE_BEGIN:
2197 	case BINARY_PRINT_ADDR:
2198 	case BINARY_PRINT_NUM_DATA:
2199 	case BINARY_PRINT_NUM_PAD:
2200 	case BINARY_PRINT_SEP:
2201 	case BINARY_PRINT_CHAR_PAD:
2202 	case BINARY_PRINT_LINE_END:
2203 	case BINARY_PRINT_DATA_END:
2204 	default:
2205 		break;
2206 	}
2207 }
2208 
2209 static void bpf_output__fprintf(struct trace *trace,
2210 				struct perf_sample *sample)
2211 {
2212 	print_binary(sample->raw_data, sample->raw_size, 8,
2213 		     bpf_output__printer, trace->output);
2214 }
2215 
2216 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
2217 				union perf_event *event __maybe_unused,
2218 				struct perf_sample *sample)
2219 {
2220 	trace__printf_interrupted_entry(trace, sample);
2221 	trace__fprintf_tstamp(trace, sample->time, trace->output);
2222 
2223 	if (trace->trace_syscalls)
2224 		fprintf(trace->output, "(         ): ");
2225 
2226 	fprintf(trace->output, "%s:", evsel->name);
2227 
2228 	if (perf_evsel__is_bpf_output(evsel)) {
2229 		bpf_output__fprintf(trace, sample);
2230 	} else if (evsel->tp_format) {
2231 		event_format__fprintf(evsel->tp_format, sample->cpu,
2232 				      sample->raw_data, sample->raw_size,
2233 				      trace->output);
2234 	}
2235 
2236 	fprintf(trace->output, ")\n");
2237 	return 0;
2238 }
2239 
2240 static void print_location(FILE *f, struct perf_sample *sample,
2241 			   struct addr_location *al,
2242 			   bool print_dso, bool print_sym)
2243 {
2244 
2245 	if ((verbose || print_dso) && al->map)
2246 		fprintf(f, "%s@", al->map->dso->long_name);
2247 
2248 	if ((verbose || print_sym) && al->sym)
2249 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
2250 			al->addr - al->sym->start);
2251 	else if (al->map)
2252 		fprintf(f, "0x%" PRIx64, al->addr);
2253 	else
2254 		fprintf(f, "0x%" PRIx64, sample->addr);
2255 }
2256 
2257 static int trace__pgfault(struct trace *trace,
2258 			  struct perf_evsel *evsel,
2259 			  union perf_event *event,
2260 			  struct perf_sample *sample)
2261 {
2262 	struct thread *thread;
2263 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
2264 	struct addr_location al;
2265 	char map_type = 'd';
2266 	struct thread_trace *ttrace;
2267 	int err = -1;
2268 
2269 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2270 	ttrace = thread__trace(thread, trace->output);
2271 	if (ttrace == NULL)
2272 		goto out_put;
2273 
2274 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
2275 		ttrace->pfmaj++;
2276 	else
2277 		ttrace->pfmin++;
2278 
2279 	if (trace->summary_only)
2280 		goto out;
2281 
2282 	thread__find_addr_location(thread, cpumode, MAP__FUNCTION,
2283 			      sample->ip, &al);
2284 
2285 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
2286 
2287 	fprintf(trace->output, "%sfault [",
2288 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
2289 		"maj" : "min");
2290 
2291 	print_location(trace->output, sample, &al, false, true);
2292 
2293 	fprintf(trace->output, "] => ");
2294 
2295 	thread__find_addr_location(thread, cpumode, MAP__VARIABLE,
2296 				   sample->addr, &al);
2297 
2298 	if (!al.map) {
2299 		thread__find_addr_location(thread, cpumode,
2300 					   MAP__FUNCTION, sample->addr, &al);
2301 
2302 		if (al.map)
2303 			map_type = 'x';
2304 		else
2305 			map_type = '?';
2306 	}
2307 
2308 	print_location(trace->output, sample, &al, true, false);
2309 
2310 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
2311 out:
2312 	err = 0;
2313 out_put:
2314 	thread__put(thread);
2315 	return err;
2316 }
2317 
2318 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
2319 {
2320 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
2321 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
2322 		return false;
2323 
2324 	if (trace->pid_list || trace->tid_list)
2325 		return true;
2326 
2327 	return false;
2328 }
2329 
2330 static int trace__process_sample(struct perf_tool *tool,
2331 				 union perf_event *event,
2332 				 struct perf_sample *sample,
2333 				 struct perf_evsel *evsel,
2334 				 struct machine *machine __maybe_unused)
2335 {
2336 	struct trace *trace = container_of(tool, struct trace, tool);
2337 	int err = 0;
2338 
2339 	tracepoint_handler handler = evsel->handler;
2340 
2341 	if (skip_sample(trace, sample))
2342 		return 0;
2343 
2344 	if (!trace->full_time && trace->base_time == 0)
2345 		trace->base_time = sample->time;
2346 
2347 	if (handler) {
2348 		++trace->nr_events;
2349 		handler(trace, evsel, event, sample);
2350 	}
2351 
2352 	return err;
2353 }
2354 
2355 static int parse_target_str(struct trace *trace)
2356 {
2357 	if (trace->opts.target.pid) {
2358 		trace->pid_list = intlist__new(trace->opts.target.pid);
2359 		if (trace->pid_list == NULL) {
2360 			pr_err("Error parsing process id string\n");
2361 			return -EINVAL;
2362 		}
2363 	}
2364 
2365 	if (trace->opts.target.tid) {
2366 		trace->tid_list = intlist__new(trace->opts.target.tid);
2367 		if (trace->tid_list == NULL) {
2368 			pr_err("Error parsing thread id string\n");
2369 			return -EINVAL;
2370 		}
2371 	}
2372 
2373 	return 0;
2374 }
2375 
2376 static int trace__record(struct trace *trace, int argc, const char **argv)
2377 {
2378 	unsigned int rec_argc, i, j;
2379 	const char **rec_argv;
2380 	const char * const record_args[] = {
2381 		"record",
2382 		"-R",
2383 		"-m", "1024",
2384 		"-c", "1",
2385 	};
2386 
2387 	const char * const sc_args[] = { "-e", };
2388 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2389 	const char * const majpf_args[] = { "-e", "major-faults" };
2390 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2391 	const char * const minpf_args[] = { "-e", "minor-faults" };
2392 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2393 
2394 	/* +1 is for the event string below */
2395 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2396 		majpf_args_nr + minpf_args_nr + argc;
2397 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2398 
2399 	if (rec_argv == NULL)
2400 		return -ENOMEM;
2401 
2402 	j = 0;
2403 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2404 		rec_argv[j++] = record_args[i];
2405 
2406 	if (trace->trace_syscalls) {
2407 		for (i = 0; i < sc_args_nr; i++)
2408 			rec_argv[j++] = sc_args[i];
2409 
2410 		/* event string may be different for older kernels - e.g., RHEL6 */
2411 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2412 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2413 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2414 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2415 		else {
2416 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2417 			return -1;
2418 		}
2419 	}
2420 
2421 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2422 		for (i = 0; i < majpf_args_nr; i++)
2423 			rec_argv[j++] = majpf_args[i];
2424 
2425 	if (trace->trace_pgfaults & TRACE_PFMIN)
2426 		for (i = 0; i < minpf_args_nr; i++)
2427 			rec_argv[j++] = minpf_args[i];
2428 
2429 	for (i = 0; i < (unsigned int)argc; i++)
2430 		rec_argv[j++] = argv[i];
2431 
2432 	return cmd_record(j, rec_argv, NULL);
2433 }
2434 
2435 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2436 
2437 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2438 {
2439 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2440 
2441 	if (IS_ERR(evsel))
2442 		return false;
2443 
2444 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2445 		perf_evsel__delete(evsel);
2446 		return false;
2447 	}
2448 
2449 	evsel->handler = trace__vfs_getname;
2450 	perf_evlist__add(evlist, evsel);
2451 	return true;
2452 }
2453 
2454 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
2455 				    u64 config)
2456 {
2457 	struct perf_evsel *evsel;
2458 	struct perf_event_attr attr = {
2459 		.type = PERF_TYPE_SOFTWARE,
2460 		.mmap_data = 1,
2461 	};
2462 
2463 	attr.config = config;
2464 	attr.sample_period = 1;
2465 
2466 	event_attr_init(&attr);
2467 
2468 	evsel = perf_evsel__new(&attr);
2469 	if (!evsel)
2470 		return -ENOMEM;
2471 
2472 	evsel->handler = trace__pgfault;
2473 	perf_evlist__add(evlist, evsel);
2474 
2475 	return 0;
2476 }
2477 
2478 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2479 {
2480 	const u32 type = event->header.type;
2481 	struct perf_evsel *evsel;
2482 
2483 	if (!trace->full_time && trace->base_time == 0)
2484 		trace->base_time = sample->time;
2485 
2486 	if (type != PERF_RECORD_SAMPLE) {
2487 		trace__process_event(trace, trace->host, event, sample);
2488 		return;
2489 	}
2490 
2491 	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2492 	if (evsel == NULL) {
2493 		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2494 		return;
2495 	}
2496 
2497 	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2498 	    sample->raw_data == NULL) {
2499 		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2500 		       perf_evsel__name(evsel), sample->tid,
2501 		       sample->cpu, sample->raw_size);
2502 	} else {
2503 		tracepoint_handler handler = evsel->handler;
2504 		handler(trace, evsel, event, sample);
2505 	}
2506 }
2507 
2508 static int trace__add_syscall_newtp(struct trace *trace)
2509 {
2510 	int ret = -1;
2511 	struct perf_evlist *evlist = trace->evlist;
2512 	struct perf_evsel *sys_enter, *sys_exit;
2513 
2514 	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
2515 	if (sys_enter == NULL)
2516 		goto out;
2517 
2518 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
2519 		goto out_delete_sys_enter;
2520 
2521 	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
2522 	if (sys_exit == NULL)
2523 		goto out_delete_sys_enter;
2524 
2525 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
2526 		goto out_delete_sys_exit;
2527 
2528 	perf_evlist__add(evlist, sys_enter);
2529 	perf_evlist__add(evlist, sys_exit);
2530 
2531 	trace->syscalls.events.sys_enter = sys_enter;
2532 	trace->syscalls.events.sys_exit  = sys_exit;
2533 
2534 	ret = 0;
2535 out:
2536 	return ret;
2537 
2538 out_delete_sys_exit:
2539 	perf_evsel__delete_priv(sys_exit);
2540 out_delete_sys_enter:
2541 	perf_evsel__delete_priv(sys_enter);
2542 	goto out;
2543 }
2544 
2545 static int trace__set_ev_qualifier_filter(struct trace *trace)
2546 {
2547 	int err = -1;
2548 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2549 						trace->ev_qualifier_ids.nr,
2550 						trace->ev_qualifier_ids.entries);
2551 
2552 	if (filter == NULL)
2553 		goto out_enomem;
2554 
2555 	if (!perf_evsel__append_filter(trace->syscalls.events.sys_enter, "&&", filter))
2556 		err = perf_evsel__append_filter(trace->syscalls.events.sys_exit, "&&", filter);
2557 
2558 	free(filter);
2559 out:
2560 	return err;
2561 out_enomem:
2562 	errno = ENOMEM;
2563 	goto out;
2564 }
2565 
/*
 * Live tracing main loop.
 *
 * Sets up the requested events (syscall tracepoints, vfs_getname probe, page
 * faults, sched_stat_runtime), creates the target maps, optionally prepares
 * a workload to fork (when argc > 0), opens and mmaps the events, applies
 * pid and syscall-id filters, then reads events from the ring buffers until
 * interrupted or drained.  On a clean finish the summary and tool stats are
 * printed when enabled.
 *
 * The evlist is deleted on every exit path that goes through
 * out_delete_evlist.  Returns 0 on success or the last error code.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	/* The vfs_getname probe is optional, just record whether we got it */
	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ)) {
		goto out_error_mem;
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
		goto out_error_mem;

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts);

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	/* Restrict the syscall tracepoints to the ids selected by the user */
	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	/* On failure evsel is used by out_error_apply_filters to report the event */
	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	/* Used below to detect whether this pass consumed any event */
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Stop producing new events, but keep reading what is queued */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new in this pass: wait for more, unless already draining */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error reporting labels, only reachable via goto.  The ones that need a
 * message buffer live in a nested block so that errbuf is scoped to them.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		strerror_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2781 
2782 static int trace__replay(struct trace *trace)
2783 {
2784 	const struct perf_evsel_str_handler handlers[] = {
2785 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2786 	};
2787 	struct perf_data_file file = {
2788 		.path  = input_name,
2789 		.mode  = PERF_DATA_MODE_READ,
2790 		.force = trace->force,
2791 	};
2792 	struct perf_session *session;
2793 	struct perf_evsel *evsel;
2794 	int err = -1;
2795 
2796 	trace->tool.sample	  = trace__process_sample;
2797 	trace->tool.mmap	  = perf_event__process_mmap;
2798 	trace->tool.mmap2	  = perf_event__process_mmap2;
2799 	trace->tool.comm	  = perf_event__process_comm;
2800 	trace->tool.exit	  = perf_event__process_exit;
2801 	trace->tool.fork	  = perf_event__process_fork;
2802 	trace->tool.attr	  = perf_event__process_attr;
2803 	trace->tool.tracing_data = perf_event__process_tracing_data;
2804 	trace->tool.build_id	  = perf_event__process_build_id;
2805 
2806 	trace->tool.ordered_events = true;
2807 	trace->tool.ordering_requires_timestamps = true;
2808 
2809 	/* add tid to output */
2810 	trace->multiple_threads = true;
2811 
2812 	session = perf_session__new(&file, false, &trace->tool);
2813 	if (session == NULL)
2814 		return -1;
2815 
2816 	if (symbol__init(&session->header.env) < 0)
2817 		goto out;
2818 
2819 	trace->host = &session->machines.host;
2820 
2821 	err = perf_session__set_tracepoints_handlers(session, handlers);
2822 	if (err)
2823 		goto out;
2824 
2825 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2826 						     "raw_syscalls:sys_enter");
2827 	/* older kernels have syscalls tp versus raw_syscalls */
2828 	if (evsel == NULL)
2829 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2830 							     "syscalls:sys_enter");
2831 
2832 	if (evsel &&
2833 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2834 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2835 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2836 		goto out;
2837 	}
2838 
2839 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2840 						     "raw_syscalls:sys_exit");
2841 	if (evsel == NULL)
2842 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2843 							     "syscalls:sys_exit");
2844 	if (evsel &&
2845 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2846 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2847 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2848 		goto out;
2849 	}
2850 
2851 	evlist__for_each(session->evlist, evsel) {
2852 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2853 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2854 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2855 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2856 			evsel->handler = trace__pgfault;
2857 	}
2858 
2859 	err = parse_target_str(trace);
2860 	if (err != 0)
2861 		goto out;
2862 
2863 	setup_pager();
2864 
2865 	err = perf_session__process_events(session);
2866 	if (err)
2867 		pr_err("Failed to process events, error %d", err);
2868 
2869 	else if (trace->summary)
2870 		trace__fprintf_thread_summary(trace, trace->output);
2871 
2872 out:
2873 	perf_session__delete(session);
2874 
2875 	return err;
2876 }
2877 
/* Print the heading of the per-thread summary; returns fprintf()'s result. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2886 
2887 static size_t thread__dump_stats(struct thread_trace *ttrace,
2888 				 struct trace *trace, FILE *fp)
2889 {
2890 	struct stats *stats;
2891 	size_t printed = 0;
2892 	struct syscall *sc;
2893 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2894 
2895 	if (inode == NULL)
2896 		return 0;
2897 
2898 	printed += fprintf(fp, "\n");
2899 
2900 	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
2901 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
2902 	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");
2903 
2904 	/* each int_node is a syscall */
2905 	while (inode) {
2906 		stats = inode->priv;
2907 		if (stats) {
2908 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2909 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2910 			double avg = avg_stats(stats);
2911 			double pct;
2912 			u64 n = (u64) stats->n;
2913 
2914 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2915 			avg /= NSEC_PER_MSEC;
2916 
2917 			sc = &trace->syscalls.table[inode->i];
2918 			printed += fprintf(fp, "   %-15s", sc->name);
2919 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
2920 					   n, avg * n, min, avg);
2921 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2922 		}
2923 
2924 		inode = intlist__next(inode);
2925 	}
2926 
2927 	printed += fprintf(fp, "\n\n");
2928 
2929 	return printed;
2930 }
2931 
/*
 * struct used to pass data to per-thread function
 *
 * Filled in by trace__fprintf_thread_summary() and handed, as the opaque
 * priv pointer, to trace__fprintf_one_thread() for every thread.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracing session the threads belong to */
	size_t printed;		/* running count of characters printed */
};
2938 
2939 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2940 {
2941 	struct summary_data *data = priv;
2942 	FILE *fp = data->fp;
2943 	size_t printed = data->printed;
2944 	struct trace *trace = data->trace;
2945 	struct thread_trace *ttrace = thread__priv(thread);
2946 	double ratio;
2947 
2948 	if (ttrace == NULL)
2949 		return 0;
2950 
2951 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2952 
2953 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2954 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2955 	printed += fprintf(fp, "%.1f%%", ratio);
2956 	if (ttrace->pfmaj)
2957 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2958 	if (ttrace->pfmin)
2959 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2960 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2961 	printed += thread__dump_stats(ttrace, trace, fp);
2962 
2963 	data->printed += printed;
2964 
2965 	return 0;
2966 }
2967 
2968 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2969 {
2970 	struct summary_data data = {
2971 		.fp = fp,
2972 		.trace = trace
2973 	};
2974 	data.printed = trace__fprintf_threads_header(fp);
2975 
2976 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2977 
2978 	return data.printed;
2979 }
2980 
2981 static int trace__set_duration(const struct option *opt, const char *str,
2982 			       int unset __maybe_unused)
2983 {
2984 	struct trace *trace = opt->value;
2985 
2986 	trace->duration_filter = atof(str);
2987 	return 0;
2988 }
2989 
2990 static int trace__set_filter_pids(const struct option *opt, const char *str,
2991 				  int unset __maybe_unused)
2992 {
2993 	int ret = -1;
2994 	size_t i;
2995 	struct trace *trace = opt->value;
2996 	/*
2997 	 * FIXME: introduce a intarray class, plain parse csv and create a
2998 	 * { int nr, int entries[] } struct...
2999 	 */
3000 	struct intlist *list = intlist__new(str);
3001 
3002 	if (list == NULL)
3003 		return -1;
3004 
3005 	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
3006 	trace->filter_pids.entries = calloc(i, sizeof(pid_t));
3007 
3008 	if (trace->filter_pids.entries == NULL)
3009 		goto out;
3010 
3011 	trace->filter_pids.entries[0] = getpid();
3012 
3013 	for (i = 1; i < trace->filter_pids.nr; ++i)
3014 		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
3015 
3016 	intlist__delete(list);
3017 	ret = 0;
3018 out:
3019 	return ret;
3020 }
3021 
3022 static int trace__open_output(struct trace *trace, const char *filename)
3023 {
3024 	struct stat st;
3025 
3026 	if (!stat(filename, &st) && st.st_size) {
3027 		char oldname[PATH_MAX];
3028 
3029 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
3030 		unlink(oldname);
3031 		rename(filename, oldname);
3032 	}
3033 
3034 	trace->output = fopen(filename, "w");
3035 
3036 	return trace->output == NULL ? -errno : 0;
3037 }
3038 
3039 static int parse_pagefaults(const struct option *opt, const char *str,
3040 			    int unset __maybe_unused)
3041 {
3042 	int *trace_pgfaults = opt->value;
3043 
3044 	if (strcmp(str, "all") == 0)
3045 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
3046 	else if (strcmp(str, "maj") == 0)
3047 		*trace_pgfaults |= TRACE_PFMAJ;
3048 	else if (strcmp(str, "min") == 0)
3049 		*trace_pgfaults |= TRACE_PFMIN;
3050 	else
3051 		return -1;
3052 
3053 	return 0;
3054 }
3055 
3056 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
3057 {
3058 	struct perf_evsel *evsel;
3059 
3060 	evlist__for_each(evlist, evsel)
3061 		evsel->handler = handler;
3062 }
3063 
3064 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
3065 {
3066 	const char *trace_usage[] = {
3067 		"perf trace [<options>] [<command>]",
3068 		"perf trace [<options>] -- <command> [<options>]",
3069 		"perf trace record [<options>] [<command>]",
3070 		"perf trace record [<options>] -- <command> [<options>]",
3071 		NULL
3072 	};
3073 	struct trace trace = {
3074 		.audit = {
3075 			.machine = audit_detect_machine(),
3076 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
3077 		},
3078 		.syscalls = {
3079 			. max = -1,
3080 		},
3081 		.opts = {
3082 			.target = {
3083 				.uid	   = UINT_MAX,
3084 				.uses_mmap = true,
3085 			},
3086 			.user_freq     = UINT_MAX,
3087 			.user_interval = ULLONG_MAX,
3088 			.no_buffering  = true,
3089 			.mmap_pages    = UINT_MAX,
3090 			.proc_map_timeout  = 500,
3091 		},
3092 		.output = stderr,
3093 		.show_comm = true,
3094 		.trace_syscalls = true,
3095 	};
3096 	const char *output_name = NULL;
3097 	const char *ev_qualifier_str = NULL;
3098 	const struct option trace_options[] = {
3099 	OPT_CALLBACK(0, "event", &trace.evlist, "event",
3100 		     "event selector. use 'perf list' to list available events",
3101 		     parse_events_option),
3102 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
3103 		    "show the thread COMM next to its id"),
3104 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
3105 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
3106 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
3107 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
3108 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
3109 		    "trace events on existing process id"),
3110 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
3111 		    "trace events on existing thread id"),
3112 	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
3113 		     "pids to filter (by the kernel)", trace__set_filter_pids),
3114 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
3115 		    "system-wide collection from all CPUs"),
3116 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
3117 		    "list of cpus to monitor"),
3118 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
3119 		    "child tasks do not inherit counters"),
3120 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
3121 		     "number of mmap data pages",
3122 		     perf_evlist__parse_mmap_pages),
3123 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
3124 		   "user to profile"),
3125 	OPT_CALLBACK(0, "duration", &trace, "float",
3126 		     "show only events with duration > N.M ms",
3127 		     trace__set_duration),
3128 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
3129 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
3130 	OPT_BOOLEAN('T', "time", &trace.full_time,
3131 		    "Show full timestamp, not time relative to first start"),
3132 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
3133 		    "Show only syscall summary with statistics"),
3134 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
3135 		    "Show all syscalls and summary with statistics"),
3136 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
3137 		     "Trace pagefaults", parse_pagefaults, "maj"),
3138 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
3139 	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
3140 	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
3141 			"per thread proc mmap processing timeout in ms"),
3142 	OPT_END()
3143 	};
3144 	const char * const trace_subcommands[] = { "record", NULL };
3145 	int err;
3146 	char bf[BUFSIZ];
3147 
3148 	signal(SIGSEGV, sighandler_dump_stack);
3149 	signal(SIGFPE, sighandler_dump_stack);
3150 
3151 	trace.evlist = perf_evlist__new();
3152 
3153 	if (trace.evlist == NULL) {
3154 		pr_err("Not enough memory to run!\n");
3155 		err = -ENOMEM;
3156 		goto out;
3157 	}
3158 
3159 	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
3160 				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3161 
3162 	if (trace.trace_pgfaults) {
3163 		trace.opts.sample_address = true;
3164 		trace.opts.sample_time = true;
3165 	}
3166 
3167 	if (trace.evlist->nr_entries > 0)
3168 		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
3169 
3170 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
3171 		return trace__record(&trace, argc-1, &argv[1]);
3172 
3173 	/* summary_only implies summary option, but don't overwrite summary if set */
3174 	if (trace.summary_only)
3175 		trace.summary = trace.summary_only;
3176 
3177 	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
3178 	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
3179 		pr_err("Please specify something to trace.\n");
3180 		return -1;
3181 	}
3182 
3183 	if (output_name != NULL) {
3184 		err = trace__open_output(&trace, output_name);
3185 		if (err < 0) {
3186 			perror("failed to create output file");
3187 			goto out;
3188 		}
3189 	}
3190 
3191 	if (ev_qualifier_str != NULL) {
3192 		const char *s = ev_qualifier_str;
3193 		struct strlist_config slist_config = {
3194 			.dirname = system_path(STRACE_GROUPS_DIR),
3195 		};
3196 
3197 		trace.not_ev_qualifier = *s == '!';
3198 		if (trace.not_ev_qualifier)
3199 			++s;
3200 		trace.ev_qualifier = strlist__new(s, &slist_config);
3201 		if (trace.ev_qualifier == NULL) {
3202 			fputs("Not enough memory to parse event qualifier",
3203 			      trace.output);
3204 			err = -ENOMEM;
3205 			goto out_close;
3206 		}
3207 
3208 		err = trace__validate_ev_qualifier(&trace);
3209 		if (err)
3210 			goto out_close;
3211 	}
3212 
3213 	err = target__validate(&trace.opts.target);
3214 	if (err) {
3215 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3216 		fprintf(trace.output, "%s", bf);
3217 		goto out_close;
3218 	}
3219 
3220 	err = target__parse_uid(&trace.opts.target);
3221 	if (err) {
3222 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
3223 		fprintf(trace.output, "%s", bf);
3224 		goto out_close;
3225 	}
3226 
3227 	if (!argc && target__none(&trace.opts.target))
3228 		trace.opts.target.system_wide = true;
3229 
3230 	if (input_name)
3231 		err = trace__replay(&trace);
3232 	else
3233 		err = trace__run(&trace, argc, argv);
3234 
3235 out_close:
3236 	if (output_name != NULL)
3237 		fclose(trace.output);
3238 out:
3239 	return err;
3240 }
3241