xref: /linux/tools/perf/builtin-trace.c (revision 3932b9ca55b0be314a36d3e84faff3e823c081f5)
1 #include <traceevent/event-parse.h>
2 #include "builtin.h"
3 #include "util/color.h"
4 #include "util/debug.h"
5 #include "util/evlist.h"
6 #include "util/machine.h"
7 #include "util/session.h"
8 #include "util/thread.h"
9 #include "util/parse-options.h"
10 #include "util/strlist.h"
11 #include "util/intlist.h"
12 #include "util/thread_map.h"
13 #include "util/stat.h"
14 #include "trace-event.h"
15 #include "util/parse-events.h"
16 
#include <libaudit.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/mman.h>
#include <linux/futex.h>
22 
/*
 * Fallbacks for constants that the headers of older distros may not provide.
 * Each one is only defined here when the system headers left it undefined.
 */
#if !defined(MAP_STACK)
#define MAP_STACK		0x20000
#endif

#if !defined(MADV_HWPOISON)
#define MADV_HWPOISON		100
#endif

#if !defined(MADV_MERGEABLE)
#define MADV_MERGEABLE		12
#endif

#if !defined(MADV_UNMERGEABLE)
#define MADV_UNMERGEABLE	13
#endif

#if !defined(EFD_SEMAPHORE)
#define EFD_SEMAPHORE		1
#endif
43 
44 struct tp_field {
45 	int offset;
46 	union {
47 		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
48 		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
49 	};
50 };
51 
52 #define TP_UINT_FIELD(bits) \
53 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
54 { \
55 	return *(u##bits *)(sample->raw_data + field->offset); \
56 }
57 
58 TP_UINT_FIELD(8);
59 TP_UINT_FIELD(16);
60 TP_UINT_FIELD(32);
61 TP_UINT_FIELD(64);
62 
63 #define TP_UINT_FIELD__SWAPPED(bits) \
64 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
65 { \
66 	u##bits value = *(u##bits *)(sample->raw_data + field->offset); \
67 	return bswap_##bits(value);\
68 }
69 
70 TP_UINT_FIELD__SWAPPED(16);
71 TP_UINT_FIELD__SWAPPED(32);
72 TP_UINT_FIELD__SWAPPED(64);
73 
74 static int tp_field__init_uint(struct tp_field *field,
75 			       struct format_field *format_field,
76 			       bool needs_swap)
77 {
78 	field->offset = format_field->offset;
79 
80 	switch (format_field->size) {
81 	case 1:
82 		field->integer = tp_field__u8;
83 		break;
84 	case 2:
85 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
86 		break;
87 	case 4:
88 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
89 		break;
90 	case 8:
91 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
92 		break;
93 	default:
94 		return -1;
95 	}
96 
97 	return 0;
98 }
99 
100 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
101 {
102 	return sample->raw_data + field->offset;
103 }
104 
105 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
106 {
107 	field->offset = format_field->offset;
108 	field->pointer = tp_field__ptr;
109 	return 0;
110 }
111 
112 struct syscall_tp {
113 	struct tp_field id;
114 	union {
115 		struct tp_field args, ret;
116 	};
117 };
118 
119 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
120 					  struct tp_field *field,
121 					  const char *name)
122 {
123 	struct format_field *format_field = perf_evsel__field(evsel, name);
124 
125 	if (format_field == NULL)
126 		return -1;
127 
128 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
129 }
130 
/*
 * Initialize the integer accessor for member @name of the struct syscall_tp
 * hanging off @evsel->priv, using the tracepoint field of the same name.
 * Evaluates to the result of perf_evsel__init_tp_uint_field().
 *
 * @evsel is parenthesized so that any pointer expression can be passed; note
 * that it is evaluated more than once.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_uint_field((evsel), &sc->name, #name); })
134 
/*
 * Look up the tracepoint field called @name in @evsel and bind @field to a
 * pointer accessor for it.
 *
 * Returns -1 when the field does not exist, otherwise whatever
 * tp_field__init_ptr() returns.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
146 
/*
 * Initialize the pointer accessor for member @name of the struct syscall_tp
 * hanging off @evsel->priv, using the tracepoint field of the same name.
 * Evaluates to the result of perf_evsel__init_tp_ptr_field().
 *
 * @evsel is parenthesized so that any pointer expression can be passed; note
 * that it is evaluated more than once.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = (evsel)->priv;\
	   perf_evsel__init_tp_ptr_field((evsel), &sc->name, #name); })
150 
151 static void perf_evsel__delete_priv(struct perf_evsel *evsel)
152 {
153 	zfree(&evsel->priv);
154 	perf_evsel__delete(evsel);
155 }
156 
157 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
158 {
159 	evsel->priv = malloc(sizeof(struct syscall_tp));
160 	if (evsel->priv != NULL) {
161 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
162 			goto out_delete;
163 
164 		evsel->handler = handler;
165 		return 0;
166 	}
167 
168 	return -ENOMEM;
169 
170 out_delete:
171 	zfree(&evsel->priv);
172 	return -ENOENT;
173 }
174 
/*
 * Create the evsel for the syscall tracepoint named @direction and prepare
 * it with perf_evsel__init_syscall_tp().
 *
 * "raw_syscalls" is tried first, falling back to "syscalls" as used by older
 * kernels (e.g., RHEL6). Returns the new evsel, or NULL when neither
 * tracepoint could be created or the initialization failed.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	if (evsel == NULL)
		evsel = perf_evsel__newtp("syscalls", direction);

	if (evsel == NULL)
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler)) {
		perf_evsel__delete_priv(evsel);
		return NULL;
	}

	return evsel;
}
194 
/*
 * Read member @name of the struct syscall_tp stored in @evsel->priv out of
 * @sample as an integer, by calling the accessor installed for it.
 * Arguments are parenthesized so arbitrary expressions can be passed.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.integer(&fields->name, (sample)); })

/*
 * Same as perf_evsel__sc_tp_uint(), but uses the pointer accessor and
 * evaluates to the address of the field inside @sample's raw data.
 */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = (evsel)->priv; \
	   fields->name.pointer(&fields->name, (sample)); })
202 
203 static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
204 					  void *sys_enter_handler,
205 					  void *sys_exit_handler)
206 {
207 	int ret = -1;
208 	struct perf_evsel *sys_enter, *sys_exit;
209 
210 	sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
211 	if (sys_enter == NULL)
212 		goto out;
213 
214 	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
215 		goto out_delete_sys_enter;
216 
217 	sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
218 	if (sys_exit == NULL)
219 		goto out_delete_sys_enter;
220 
221 	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
222 		goto out_delete_sys_exit;
223 
224 	perf_evlist__add(evlist, sys_enter);
225 	perf_evlist__add(evlist, sys_exit);
226 
227 	ret = 0;
228 out:
229 	return ret;
230 
231 out_delete_sys_exit:
232 	perf_evsel__delete_priv(sys_exit);
233 out_delete_sys_enter:
234 	perf_evsel__delete_priv(sys_enter);
235 	goto out;
236 }
237 
238 
239 struct syscall_arg {
240 	unsigned long val;
241 	struct thread *thread;
242 	struct trace  *trace;
243 	void	      *parm;
244 	u8	      idx;
245 	u8	      mask;
246 };
247 
/*
 * Table that maps small integers to names.
 *
 * @offset:     value corresponding to entries[0]; it is subtracted from the
 *              looked up value to obtain the index
 * @nr_entries: number of slots in @entries
 * @entries:    the names themselves
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};
253 
/*
 * Define "strarray__<array>", a struct strarray wrapping the string table
 * @array, whose first entry corresponds to the value @off.
 */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same as DEFINE_STRARRAY_OFFSET() for tables whose first entry is value 0. */
#define DEFINE_STRARRAY(array) DEFINE_STRARRAY_OFFSET(array, 0)
264 
265 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
266 						const char *intfmt,
267 					        struct syscall_arg *arg)
268 {
269 	struct strarray *sa = arg->parm;
270 	int idx = arg->val - sa->offset;
271 
272 	if (idx < 0 || idx >= sa->nr_entries)
273 		return scnprintf(bf, size, intfmt, arg->val);
274 
275 	return scnprintf(bf, size, "%s", sa->entries[idx]);
276 }
277 
/*
 * String array formatter that prints values without a name in decimal.
 */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	const char *fallback_fmt = "%d";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
285 
#if defined(__i386__) || defined(__x86_64__)
/*
 * String array formatter that prints values without a name in hexadecimal.
 *
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	const char *fallback_fmt = "%#x";

	return __syscall_arg__scnprintf_strarray(bf, size, fallback_fmt, arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
299 
300 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
301 					struct syscall_arg *arg);
302 
303 #define SCA_FD syscall_arg__scnprintf_fd
304 
305 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
306 					   struct syscall_arg *arg)
307 {
308 	int fd = arg->val;
309 
310 	if (fd == AT_FDCWD)
311 		return scnprintf(bf, size, "CWD");
312 
313 	return syscall_arg__scnprintf_fd(bf, size, arg);
314 }
315 
316 #define SCA_FDAT syscall_arg__scnprintf_fd_at
317 
318 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
319 					      struct syscall_arg *arg);
320 
321 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
322 
323 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
324 					 struct syscall_arg *arg)
325 {
326 	return scnprintf(bf, size, "%#lx", arg->val);
327 }
328 
329 #define SCA_HEX syscall_arg__scnprintf_hex
330 
331 static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
332 					       struct syscall_arg *arg)
333 {
334 	int printed = 0, prot = arg->val;
335 
336 	if (prot == PROT_NONE)
337 		return scnprintf(bf, size, "NONE");
338 #define	P_MMAP_PROT(n) \
339 	if (prot & PROT_##n) { \
340 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
341 		prot &= ~PROT_##n; \
342 	}
343 
344 	P_MMAP_PROT(EXEC);
345 	P_MMAP_PROT(READ);
346 	P_MMAP_PROT(WRITE);
347 #ifdef PROT_SEM
348 	P_MMAP_PROT(SEM);
349 #endif
350 	P_MMAP_PROT(GROWSDOWN);
351 	P_MMAP_PROT(GROWSUP);
352 #undef P_MMAP_PROT
353 
354 	if (prot)
355 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
356 
357 	return printed;
358 }
359 
360 #define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
361 
362 static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
363 						struct syscall_arg *arg)
364 {
365 	int printed = 0, flags = arg->val;
366 
367 #define	P_MMAP_FLAG(n) \
368 	if (flags & MAP_##n) { \
369 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
370 		flags &= ~MAP_##n; \
371 	}
372 
373 	P_MMAP_FLAG(SHARED);
374 	P_MMAP_FLAG(PRIVATE);
375 #ifdef MAP_32BIT
376 	P_MMAP_FLAG(32BIT);
377 #endif
378 	P_MMAP_FLAG(ANONYMOUS);
379 	P_MMAP_FLAG(DENYWRITE);
380 	P_MMAP_FLAG(EXECUTABLE);
381 	P_MMAP_FLAG(FILE);
382 	P_MMAP_FLAG(FIXED);
383 	P_MMAP_FLAG(GROWSDOWN);
384 #ifdef MAP_HUGETLB
385 	P_MMAP_FLAG(HUGETLB);
386 #endif
387 	P_MMAP_FLAG(LOCKED);
388 	P_MMAP_FLAG(NONBLOCK);
389 	P_MMAP_FLAG(NORESERVE);
390 	P_MMAP_FLAG(POPULATE);
391 	P_MMAP_FLAG(STACK);
392 #ifdef MAP_UNINITIALIZED
393 	P_MMAP_FLAG(UNINITIALIZED);
394 #endif
395 #undef P_MMAP_FLAG
396 
397 	if (flags)
398 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
399 
400 	return printed;
401 }
402 
403 #define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
404 
405 static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
406 						      struct syscall_arg *arg)
407 {
408 	int behavior = arg->val;
409 
410 	switch (behavior) {
411 #define	P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
412 	P_MADV_BHV(NORMAL);
413 	P_MADV_BHV(RANDOM);
414 	P_MADV_BHV(SEQUENTIAL);
415 	P_MADV_BHV(WILLNEED);
416 	P_MADV_BHV(DONTNEED);
417 	P_MADV_BHV(REMOVE);
418 	P_MADV_BHV(DONTFORK);
419 	P_MADV_BHV(DOFORK);
420 	P_MADV_BHV(HWPOISON);
421 #ifdef MADV_SOFT_OFFLINE
422 	P_MADV_BHV(SOFT_OFFLINE);
423 #endif
424 	P_MADV_BHV(MERGEABLE);
425 	P_MADV_BHV(UNMERGEABLE);
426 #ifdef MADV_HUGEPAGE
427 	P_MADV_BHV(HUGEPAGE);
428 #endif
429 #ifdef MADV_NOHUGEPAGE
430 	P_MADV_BHV(NOHUGEPAGE);
431 #endif
432 #ifdef MADV_DONTDUMP
433 	P_MADV_BHV(DONTDUMP);
434 #endif
435 #ifdef MADV_DODUMP
436 	P_MADV_BHV(DODUMP);
437 #endif
438 #undef P_MADV_PHV
439 	default: break;
440 	}
441 
442 	return scnprintf(bf, size, "%#x", behavior);
443 }
444 
445 #define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
446 
447 static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
448 					   struct syscall_arg *arg)
449 {
450 	int printed = 0, op = arg->val;
451 
452 	if (op == 0)
453 		return scnprintf(bf, size, "NONE");
454 #define	P_CMD(cmd) \
455 	if ((op & LOCK_##cmd) == LOCK_##cmd) { \
456 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
457 		op &= ~LOCK_##cmd; \
458 	}
459 
460 	P_CMD(SH);
461 	P_CMD(EX);
462 	P_CMD(NB);
463 	P_CMD(UN);
464 	P_CMD(MAND);
465 	P_CMD(RW);
466 	P_CMD(READ);
467 	P_CMD(WRITE);
468 #undef P_OP
469 
470 	if (op)
471 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
472 
473 	return printed;
474 }
475 
476 #define SCA_FLOCK syscall_arg__scnprintf_flock
477 
478 static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
479 {
480 	enum syscall_futex_args {
481 		SCF_UADDR   = (1 << 0),
482 		SCF_OP	    = (1 << 1),
483 		SCF_VAL	    = (1 << 2),
484 		SCF_TIMEOUT = (1 << 3),
485 		SCF_UADDR2  = (1 << 4),
486 		SCF_VAL3    = (1 << 5),
487 	};
488 	int op = arg->val;
489 	int cmd = op & FUTEX_CMD_MASK;
490 	size_t printed = 0;
491 
492 	switch (cmd) {
493 #define	P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
494 	P_FUTEX_OP(WAIT);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
495 	P_FUTEX_OP(WAKE);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
496 	P_FUTEX_OP(FD);		    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
497 	P_FUTEX_OP(REQUEUE);	    arg->mask |= SCF_VAL3|SCF_TIMEOUT;	          break;
498 	P_FUTEX_OP(CMP_REQUEUE);    arg->mask |= SCF_TIMEOUT;			  break;
499 	P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT;			  break;
500 	P_FUTEX_OP(WAKE_OP);							  break;
501 	P_FUTEX_OP(LOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
502 	P_FUTEX_OP(UNLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
503 	P_FUTEX_OP(TRYLOCK_PI);	    arg->mask |= SCF_VAL3|SCF_UADDR2;		  break;
504 	P_FUTEX_OP(WAIT_BITSET);    arg->mask |= SCF_UADDR2;			  break;
505 	P_FUTEX_OP(WAKE_BITSET);    arg->mask |= SCF_UADDR2;			  break;
506 	P_FUTEX_OP(WAIT_REQUEUE_PI);						  break;
507 	default: printed = scnprintf(bf, size, "%#x", cmd);			  break;
508 	}
509 
510 	if (op & FUTEX_PRIVATE_FLAG)
511 		printed += scnprintf(bf + printed, size - printed, "|PRIV");
512 
513 	if (op & FUTEX_CLOCK_REALTIME)
514 		printed += scnprintf(bf + printed, size - printed, "|CLKRT");
515 
516 	return printed;
517 }
518 
519 #define SCA_FUTEX_OP  syscall_arg__scnprintf_futex_op
520 
521 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
522 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
523 
/* Interval timer names, indexed by the "which" value. */
static const char *itimers[] = {
	"REAL",		/* 0 */
	"VIRTUAL",	/* 1 */
	"PROF",		/* 2 */
};
static DEFINE_STRARRAY(itimers);
526 
/*
 * lseek "whence" names. "DATA" and "HOLE" are only present when the system
 * headers define SEEK_DATA and SEEK_HOLE.
 */
static const char *whences[] = {
	"SET",
	"CUR",
	"END",
#ifdef SEEK_DATA
	"DATA",
#endif
#ifdef SEEK_HOLE
	"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);
536 
/*
 * fcntl command names, indexed by command number. All names are spelled
 * without the "F_" prefix so that the whole table is printed consistently.
 */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
543 static DEFINE_STRARRAY(fcntl_cmds);
544 
/* Resource limit names, indexed by the "resource" value. */
static const char *rlimit_resources[] = {
	"CPU",		/*  0 */
	"FSIZE",	/*  1 */
	"DATA",		/*  2 */
	"STACK",	/*  3 */
	"CORE",		/*  4 */
	"RSS",		/*  5 */
	"NPROC",	/*  6 */
	"NOFILE",	/*  7 */
	"MEMLOCK",	/*  8 */
	"AS",		/*  9 */
	"LOCKS",	/* 10 */
	"SIGPENDING",	/* 11 */
	"MSGQUEUE",	/* 12 */
	"NICE",		/* 13 */
	"RTPRIO",	/* 14 */
	"RTTIME",	/* 15 */
};
static DEFINE_STRARRAY(rlimit_resources);
551 
/* rt_sigprocmask "how" names, indexed by value. */
static const char *sighow[] = {
	"BLOCK",	/* 0 */
	"UNBLOCK",	/* 1 */
	"SETMASK",	/* 2 */
};
static DEFINE_STRARRAY(sighow);
554 
/* Clock names, indexed by clock id. */
static const char *clockid[] = {
	"REALTIME",		/* 0 */
	"MONOTONIC",		/* 1 */
	"PROCESS_CPUTIME_ID",	/* 2 */
	"THREAD_CPUTIME_ID",	/* 3 */
	"MONOTONIC_RAW",	/* 4 */
	"REALTIME_COARSE",	/* 5 */
	"MONOTONIC_COARSE",	/* 6 */
};
static DEFINE_STRARRAY(clockid);
560 
/*
 * Socket address family names, indexed by the AF_ value.
 *
 * Family 28 sits between "IB" (27) and "CAN" (29); it is listed explicitly
 * so that every later name stays at the index matching its AF_ number.
 */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "MPLS", "CAN",
	"TIPC", "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154",
	"CAIF", "ALG", "NFC", "VSOCK",
};
569 static DEFINE_STRARRAY(socket_families);
570 
571 #ifndef SOCK_TYPE_MASK
572 #define SOCK_TYPE_MASK 0xf
573 #endif
574 
575 static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
576 						      struct syscall_arg *arg)
577 {
578 	size_t printed;
579 	int type = arg->val,
580 	    flags = type & ~SOCK_TYPE_MASK;
581 
582 	type &= SOCK_TYPE_MASK;
583 	/*
584  	 * Can't use a strarray, MIPS may override for ABI reasons.
585  	 */
586 	switch (type) {
587 #define	P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
588 	P_SK_TYPE(STREAM);
589 	P_SK_TYPE(DGRAM);
590 	P_SK_TYPE(RAW);
591 	P_SK_TYPE(RDM);
592 	P_SK_TYPE(SEQPACKET);
593 	P_SK_TYPE(DCCP);
594 	P_SK_TYPE(PACKET);
595 #undef P_SK_TYPE
596 	default:
597 		printed = scnprintf(bf, size, "%#x", type);
598 	}
599 
600 #define	P_SK_FLAG(n) \
601 	if (flags & SOCK_##n) { \
602 		printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
603 		flags &= ~SOCK_##n; \
604 	}
605 
606 	P_SK_FLAG(CLOEXEC);
607 	P_SK_FLAG(NONBLOCK);
608 #undef P_SK_FLAG
609 
610 	if (flags)
611 		printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
612 
613 	return printed;
614 }
615 
616 #define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
617 
618 #ifndef MSG_PROBE
619 #define MSG_PROBE	     0x10
620 #endif
621 #ifndef MSG_WAITFORONE
622 #define MSG_WAITFORONE	0x10000
623 #endif
624 #ifndef MSG_SENDPAGE_NOTLAST
625 #define MSG_SENDPAGE_NOTLAST 0x20000
626 #endif
627 #ifndef MSG_FASTOPEN
628 #define MSG_FASTOPEN	     0x20000000
629 #endif
630 
631 static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
632 					       struct syscall_arg *arg)
633 {
634 	int printed = 0, flags = arg->val;
635 
636 	if (flags == 0)
637 		return scnprintf(bf, size, "NONE");
638 #define	P_MSG_FLAG(n) \
639 	if (flags & MSG_##n) { \
640 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
641 		flags &= ~MSG_##n; \
642 	}
643 
644 	P_MSG_FLAG(OOB);
645 	P_MSG_FLAG(PEEK);
646 	P_MSG_FLAG(DONTROUTE);
647 	P_MSG_FLAG(TRYHARD);
648 	P_MSG_FLAG(CTRUNC);
649 	P_MSG_FLAG(PROBE);
650 	P_MSG_FLAG(TRUNC);
651 	P_MSG_FLAG(DONTWAIT);
652 	P_MSG_FLAG(EOR);
653 	P_MSG_FLAG(WAITALL);
654 	P_MSG_FLAG(FIN);
655 	P_MSG_FLAG(SYN);
656 	P_MSG_FLAG(CONFIRM);
657 	P_MSG_FLAG(RST);
658 	P_MSG_FLAG(ERRQUEUE);
659 	P_MSG_FLAG(NOSIGNAL);
660 	P_MSG_FLAG(MORE);
661 	P_MSG_FLAG(WAITFORONE);
662 	P_MSG_FLAG(SENDPAGE_NOTLAST);
663 	P_MSG_FLAG(FASTOPEN);
664 	P_MSG_FLAG(CMSG_CLOEXEC);
665 #undef P_MSG_FLAG
666 
667 	if (flags)
668 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
669 
670 	return printed;
671 }
672 
673 #define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
674 
675 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
676 						 struct syscall_arg *arg)
677 {
678 	size_t printed = 0;
679 	int mode = arg->val;
680 
681 	if (mode == F_OK) /* 0 */
682 		return scnprintf(bf, size, "F");
683 #define	P_MODE(n) \
684 	if (mode & n##_OK) { \
685 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
686 		mode &= ~n##_OK; \
687 	}
688 
689 	P_MODE(R);
690 	P_MODE(W);
691 	P_MODE(X);
692 #undef P_MODE
693 
694 	if (mode)
695 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
696 
697 	return printed;
698 }
699 
700 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
701 
702 static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
703 					       struct syscall_arg *arg)
704 {
705 	int printed = 0, flags = arg->val;
706 
707 	if (!(flags & O_CREAT))
708 		arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
709 
710 	if (flags == 0)
711 		return scnprintf(bf, size, "RDONLY");
712 #define	P_FLAG(n) \
713 	if (flags & O_##n) { \
714 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
715 		flags &= ~O_##n; \
716 	}
717 
718 	P_FLAG(APPEND);
719 	P_FLAG(ASYNC);
720 	P_FLAG(CLOEXEC);
721 	P_FLAG(CREAT);
722 	P_FLAG(DIRECT);
723 	P_FLAG(DIRECTORY);
724 	P_FLAG(EXCL);
725 	P_FLAG(LARGEFILE);
726 	P_FLAG(NOATIME);
727 	P_FLAG(NOCTTY);
728 #ifdef O_NONBLOCK
729 	P_FLAG(NONBLOCK);
730 #elif O_NDELAY
731 	P_FLAG(NDELAY);
732 #endif
733 #ifdef O_PATH
734 	P_FLAG(PATH);
735 #endif
736 	P_FLAG(RDWR);
737 #ifdef O_DSYNC
738 	if ((flags & O_SYNC) == O_SYNC)
739 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
740 	else {
741 		P_FLAG(DSYNC);
742 	}
743 #else
744 	P_FLAG(SYNC);
745 #endif
746 	P_FLAG(TRUNC);
747 	P_FLAG(WRONLY);
748 #undef P_FLAG
749 
750 	if (flags)
751 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
752 
753 	return printed;
754 }
755 
756 #define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
757 
758 static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
759 						   struct syscall_arg *arg)
760 {
761 	int printed = 0, flags = arg->val;
762 
763 	if (flags == 0)
764 		return scnprintf(bf, size, "NONE");
765 #define	P_FLAG(n) \
766 	if (flags & EFD_##n) { \
767 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
768 		flags &= ~EFD_##n; \
769 	}
770 
771 	P_FLAG(SEMAPHORE);
772 	P_FLAG(CLOEXEC);
773 	P_FLAG(NONBLOCK);
774 #undef P_FLAG
775 
776 	if (flags)
777 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
778 
779 	return printed;
780 }
781 
782 #define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
783 
784 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
785 						struct syscall_arg *arg)
786 {
787 	int printed = 0, flags = arg->val;
788 
789 #define	P_FLAG(n) \
790 	if (flags & O_##n) { \
791 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
792 		flags &= ~O_##n; \
793 	}
794 
795 	P_FLAG(CLOEXEC);
796 	P_FLAG(NONBLOCK);
797 #undef P_FLAG
798 
799 	if (flags)
800 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
801 
802 	return printed;
803 }
804 
805 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
806 
807 static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
808 {
809 	int sig = arg->val;
810 
811 	switch (sig) {
812 #define	P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
813 	P_SIGNUM(HUP);
814 	P_SIGNUM(INT);
815 	P_SIGNUM(QUIT);
816 	P_SIGNUM(ILL);
817 	P_SIGNUM(TRAP);
818 	P_SIGNUM(ABRT);
819 	P_SIGNUM(BUS);
820 	P_SIGNUM(FPE);
821 	P_SIGNUM(KILL);
822 	P_SIGNUM(USR1);
823 	P_SIGNUM(SEGV);
824 	P_SIGNUM(USR2);
825 	P_SIGNUM(PIPE);
826 	P_SIGNUM(ALRM);
827 	P_SIGNUM(TERM);
828 	P_SIGNUM(CHLD);
829 	P_SIGNUM(CONT);
830 	P_SIGNUM(STOP);
831 	P_SIGNUM(TSTP);
832 	P_SIGNUM(TTIN);
833 	P_SIGNUM(TTOU);
834 	P_SIGNUM(URG);
835 	P_SIGNUM(XCPU);
836 	P_SIGNUM(XFSZ);
837 	P_SIGNUM(VTALRM);
838 	P_SIGNUM(PROF);
839 	P_SIGNUM(WINCH);
840 	P_SIGNUM(IO);
841 	P_SIGNUM(PWR);
842 	P_SIGNUM(SYS);
843 #ifdef SIGEMT
844 	P_SIGNUM(EMT);
845 #endif
846 #ifdef SIGSTKFLT
847 	P_SIGNUM(STKFLT);
848 #endif
849 #ifdef SIGSWI
850 	P_SIGNUM(SWI);
851 #endif
852 	default: break;
853 	}
854 
855 	return scnprintf(bf, size, "%#x", sig);
856 }
857 
858 #define SCA_SIGNUM syscall_arg__scnprintf_signum
859 
860 #if defined(__i386__) || defined(__x86_64__)
861 /*
862  * FIXME: Make this available to all arches.
863  */
#define TCGETS		0x5401

/*
 * Names of the tty ioctl requests, indexed by (request - TCGETS): slot 0 is
 * TCGETS (0x5401) and consecutive slots are consecutive request numbers.
 *
 * Where the numbering has holes a designated index, written as
 * "request value - TCGETS", re-synchronizes the table so that each name
 * stays in the slot of its own request number. Slots that are skipped this
 * way are left NULL.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP",
	[0x5427 - TCGETS] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK",
	[0x5440 - TCGETS] = "TIOCGEXCL",
	[0x5450 - TCGETS] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT",
	[0x5460 - TCGETS] = "FIOQSIZE",
};
883 
884 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
885 #endif /* defined(__i386__) || defined(__x86_64__) */
886 
/*
 * Initializer fragment for a struct syscall_fmt entry: format argument
 * number @arg with the string array formatter, using strarray__<array> as
 * its table. @name is not used in the expansion; callers pass the
 * argument's name there.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
890 
891 static struct syscall_fmt {
892 	const char *name;
893 	const char *alias;
894 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
895 	void	   *arg_parm[6];
896 	bool	   errmsg;
897 	bool	   timeout;
898 	bool	   hexret;
899 } syscall_fmts[] = {
900 	{ .name	    = "access",	    .errmsg = true,
901 	  .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
902 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
903 	{ .name	    = "brk",	    .hexret = true,
904 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
905 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
906 	{ .name	    = "close",	    .errmsg = true,
907 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
908 	{ .name	    = "connect",    .errmsg = true, },
909 	{ .name	    = "dup",	    .errmsg = true,
910 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
911 	{ .name	    = "dup2",	    .errmsg = true,
912 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
913 	{ .name	    = "dup3",	    .errmsg = true,
914 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
915 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
916 	{ .name	    = "eventfd2",   .errmsg = true,
917 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
918 	{ .name	    = "faccessat",  .errmsg = true,
919 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
920 	{ .name	    = "fadvise64",  .errmsg = true,
921 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
922 	{ .name	    = "fallocate",  .errmsg = true,
923 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
924 	{ .name	    = "fchdir",	    .errmsg = true,
925 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
926 	{ .name	    = "fchmod",	    .errmsg = true,
927 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
928 	{ .name	    = "fchmodat",   .errmsg = true,
929 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
930 	{ .name	    = "fchown",	    .errmsg = true,
931 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
932 	{ .name	    = "fchownat",   .errmsg = true,
933 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
934 	{ .name	    = "fcntl",	    .errmsg = true,
935 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
936 			     [1] = SCA_STRARRAY, /* cmd */ },
937 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
938 	{ .name	    = "fdatasync",  .errmsg = true,
939 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
940 	{ .name	    = "flock",	    .errmsg = true,
941 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
942 			     [1] = SCA_FLOCK, /* cmd */ }, },
943 	{ .name	    = "fsetxattr",  .errmsg = true,
944 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
945 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat",
946 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
947 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat",
948 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
949 	{ .name	    = "fstatfs",    .errmsg = true,
950 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
951 	{ .name	    = "fsync",    .errmsg = true,
952 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
953 	{ .name	    = "ftruncate", .errmsg = true,
954 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
955 	{ .name	    = "futex",	    .errmsg = true,
956 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
957 	{ .name	    = "futimesat", .errmsg = true,
958 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
959 	{ .name	    = "getdents",   .errmsg = true,
960 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
961 	{ .name	    = "getdents64", .errmsg = true,
962 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
963 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
964 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
965 	{ .name	    = "ioctl",	    .errmsg = true,
966 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
967 #if defined(__i386__) || defined(__x86_64__)
968 /*
969  * FIXME: Make this available to all arches.
970  */
971 			     [1] = SCA_STRHEXARRAY, /* cmd */
972 			     [2] = SCA_HEX, /* arg */ },
973 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
974 #else
975 			     [2] = SCA_HEX, /* arg */ }, },
976 #endif
977 	{ .name	    = "kill",	    .errmsg = true,
978 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
979 	{ .name	    = "linkat",	    .errmsg = true,
980 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
981 	{ .name	    = "lseek",	    .errmsg = true,
982 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */
983 			     [2] = SCA_STRARRAY, /* whence */ },
984 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
985 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
986 	{ .name     = "madvise",    .errmsg = true,
987 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
988 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
989 	{ .name	    = "mkdirat",    .errmsg = true,
990 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
991 	{ .name	    = "mknodat",    .errmsg = true,
992 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
993 	{ .name	    = "mlock",	    .errmsg = true,
994 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
995 	{ .name	    = "mlockall",   .errmsg = true,
996 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
997 	{ .name	    = "mmap",	    .hexret = true,
998 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
999 			     [2] = SCA_MMAP_PROT, /* prot */
1000 			     [3] = SCA_MMAP_FLAGS, /* flags */
1001 			     [4] = SCA_FD, 	  /* fd */ }, },
1002 	{ .name	    = "mprotect",   .errmsg = true,
1003 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
1004 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
1005 	{ .name	    = "mremap",	    .hexret = true,
1006 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
1007 			     [4] = SCA_HEX, /* new_addr */ }, },
1008 	{ .name	    = "munlock",    .errmsg = true,
1009 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1010 	{ .name	    = "munmap",	    .errmsg = true,
1011 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
1012 	{ .name	    = "name_to_handle_at", .errmsg = true,
1013 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1014 	{ .name	    = "newfstatat", .errmsg = true,
1015 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1016 	{ .name	    = "open",	    .errmsg = true,
1017 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
1018 	{ .name	    = "open_by_handle_at", .errmsg = true,
1019 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1020 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1021 	{ .name	    = "openat",	    .errmsg = true,
1022 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
1023 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
1024 	{ .name	    = "pipe2",	    .errmsg = true,
1025 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
1026 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
1027 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
1028 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64",
1029 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1030 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread",
1031 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1032 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
1033 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64",
1034 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1035 	{ .name	    = "pwritev",    .errmsg = true,
1036 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1037 	{ .name	    = "read",	    .errmsg = true,
1038 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1039 	{ .name	    = "readlinkat", .errmsg = true,
1040 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1041 	{ .name	    = "readv",	    .errmsg = true,
1042 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1043 	{ .name	    = "recvfrom",   .errmsg = true,
1044 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1045 	{ .name	    = "recvmmsg",   .errmsg = true,
1046 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1047 	{ .name	    = "recvmsg",    .errmsg = true,
1048 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1049 	{ .name	    = "renameat",   .errmsg = true,
1050 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1051 	{ .name	    = "rt_sigaction", .errmsg = true,
1052 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
1053 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
1054 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
1055 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1056 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
1057 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1058 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
1059 	{ .name	    = "sendmmsg",    .errmsg = true,
1060 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1061 	{ .name	    = "sendmsg",    .errmsg = true,
1062 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
1063 	{ .name	    = "sendto",	    .errmsg = true,
1064 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
1065 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
1066 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
1067 	{ .name	    = "shutdown",   .errmsg = true,
1068 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1069 	{ .name	    = "socket",	    .errmsg = true,
1070 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1071 			     [1] = SCA_SK_TYPE, /* type */ },
1072 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1073 	{ .name	    = "socketpair", .errmsg = true,
1074 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
1075 			     [1] = SCA_SK_TYPE, /* type */ },
1076 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
1077 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
1078 	{ .name	    = "symlinkat",  .errmsg = true,
1079 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1080 	{ .name	    = "tgkill",	    .errmsg = true,
1081 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
1082 	{ .name	    = "tkill",	    .errmsg = true,
1083 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
1084 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
1085 	{ .name	    = "unlinkat",   .errmsg = true,
1086 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
1087 	{ .name	    = "utimensat",  .errmsg = true,
1088 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
1089 	{ .name	    = "write",	    .errmsg = true,
1090 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1091 	{ .name	    = "writev",	    .errmsg = true,
1092 	  .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
1093 };
1094 
1095 static int syscall_fmt__cmp(const void *name, const void *fmtp)
1096 {
1097 	const struct syscall_fmt *fmt = fmtp;
1098 	return strcmp(name, fmt->name);
1099 }
1100 
1101 static struct syscall_fmt *syscall_fmt__find(const char *name)
1102 {
1103 	const int nmemb = ARRAY_SIZE(syscall_fmts);
1104 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
1105 }
1106 
/*
 * Per syscall-id state, one slot of trace->syscalls.table. Slots are filled
 * in lazily by trace__read_syscall_info() the first time an id is seen.
 */
struct syscall {
	/* Format of the syscalls:sys_enter_<name> tracepoint (argument names) */
	struct event_format *tp_format;
	/* Name as returned by audit_syscall_to_name(); NULL means "not read yet" */
	const char	    *name;
	/* True when the event qualifier excludes this syscall from the output */
	bool		    filtered;
	/* True for "exit" and "exit_group": the entry is printed right away */
	bool		    is_exit;
	/* Matching syscall_fmts[] entry, or NULL if the syscall has none */
	struct syscall_fmt  *fmt;
	/* Per-argument formatters, indexed by argument position; NULL = default */
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	/* Per-argument formatter parameters, taken from fmt->arg_parm */
	void		    **arg_parm;
};
1116 
1117 static size_t fprintf_duration(unsigned long t, FILE *fp)
1118 {
1119 	double duration = (double)t / NSEC_PER_MSEC;
1120 	size_t printed = fprintf(fp, "(");
1121 
1122 	if (duration >= 1.0)
1123 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
1124 	else if (duration >= 0.01)
1125 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
1126 	else
1127 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
1128 	return printed + fprintf(fp, "): ");
1129 }
1130 
1131 struct thread_trace {
1132 	u64		  entry_time;
1133 	u64		  exit_time;
1134 	bool		  entry_pending;
1135 	unsigned long	  nr_events;
1136 	unsigned long	  pfmaj, pfmin;
1137 	char		  *entry_str;
1138 	double		  runtime_ms;
1139 	struct {
1140 		int	  max;
1141 		char	  **table;
1142 	} paths;
1143 
1144 	struct intlist *syscall_stats;
1145 };
1146 
1147 static struct thread_trace *thread_trace__new(void)
1148 {
1149 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
1150 
1151 	if (ttrace)
1152 		ttrace->paths.max = -1;
1153 
1154 	ttrace->syscall_stats = intlist__new(NULL);
1155 
1156 	return ttrace;
1157 }
1158 
1159 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
1160 {
1161 	struct thread_trace *ttrace;
1162 
1163 	if (thread == NULL)
1164 		goto fail;
1165 
1166 	if (thread->priv == NULL)
1167 		thread->priv = thread_trace__new();
1168 
1169 	if (thread->priv == NULL)
1170 		goto fail;
1171 
1172 	ttrace = thread->priv;
1173 	++ttrace->nr_events;
1174 
1175 	return ttrace;
1176 fail:
1177 	color_fprintf(fp, PERF_COLOR_RED,
1178 		      "WARNING: not enough memory, dropping samples!\n");
1179 	return NULL;
1180 }
1181 
1182 #define TRACE_PFMAJ		(1 << 0)
1183 #define TRACE_PFMIN		(1 << 1)
1184 
1185 struct trace {
1186 	struct perf_tool	tool;
1187 	struct {
1188 		int		machine;
1189 		int		open_id;
1190 	}			audit;
1191 	struct {
1192 		int		max;
1193 		struct syscall  *table;
1194 	} syscalls;
1195 	struct record_opts	opts;
1196 	struct machine		*host;
1197 	u64			base_time;
1198 	FILE			*output;
1199 	unsigned long		nr_events;
1200 	struct strlist		*ev_qualifier;
1201 	const char 		*last_vfs_getname;
1202 	struct intlist		*tid_list;
1203 	struct intlist		*pid_list;
1204 	double			duration_filter;
1205 	double			runtime_ms;
1206 	struct {
1207 		u64		vfs_getname,
1208 				proc_getname;
1209 	} stats;
1210 	bool			not_ev_qualifier;
1211 	bool			live;
1212 	bool			full_time;
1213 	bool			sched;
1214 	bool			multiple_threads;
1215 	bool			summary;
1216 	bool			summary_only;
1217 	bool			show_comm;
1218 	bool			show_tool_stats;
1219 	bool			trace_syscalls;
1220 	int			trace_pgfaults;
1221 };
1222 
1223 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
1224 {
1225 	struct thread_trace *ttrace = thread->priv;
1226 
1227 	if (fd > ttrace->paths.max) {
1228 		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
1229 
1230 		if (npath == NULL)
1231 			return -1;
1232 
1233 		if (ttrace->paths.max != -1) {
1234 			memset(npath + ttrace->paths.max + 1, 0,
1235 			       (fd - ttrace->paths.max) * sizeof(char *));
1236 		} else {
1237 			memset(npath, 0, (fd + 1) * sizeof(char *));
1238 		}
1239 
1240 		ttrace->paths.table = npath;
1241 		ttrace->paths.max   = fd;
1242 	}
1243 
1244 	ttrace->paths.table[fd] = strdup(pathname);
1245 
1246 	return ttrace->paths.table[fd] != NULL ? 0 : -1;
1247 }
1248 
1249 static int thread__read_fd_path(struct thread *thread, int fd)
1250 {
1251 	char linkname[PATH_MAX], pathname[PATH_MAX];
1252 	struct stat st;
1253 	int ret;
1254 
1255 	if (thread->pid_ == thread->tid) {
1256 		scnprintf(linkname, sizeof(linkname),
1257 			  "/proc/%d/fd/%d", thread->pid_, fd);
1258 	} else {
1259 		scnprintf(linkname, sizeof(linkname),
1260 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
1261 	}
1262 
1263 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
1264 		return -1;
1265 
1266 	ret = readlink(linkname, pathname, sizeof(pathname));
1267 
1268 	if (ret < 0 || ret > st.st_size)
1269 		return -1;
1270 
1271 	pathname[ret] = '\0';
1272 	return trace__set_fd_pathname(thread, fd, pathname);
1273 }
1274 
1275 static const char *thread__fd_path(struct thread *thread, int fd,
1276 				   struct trace *trace)
1277 {
1278 	struct thread_trace *ttrace = thread->priv;
1279 
1280 	if (ttrace == NULL)
1281 		return NULL;
1282 
1283 	if (fd < 0)
1284 		return NULL;
1285 
1286 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
1287 		if (!trace->live)
1288 			return NULL;
1289 		++trace->stats.proc_getname;
1290 		if (thread__read_fd_path(thread, fd))
1291 			return NULL;
1292 	}
1293 
1294 	return ttrace->paths.table[fd];
1295 }
1296 
1297 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1298 					struct syscall_arg *arg)
1299 {
1300 	int fd = arg->val;
1301 	size_t printed = scnprintf(bf, size, "%d", fd);
1302 	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1303 
1304 	if (path)
1305 		printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1306 
1307 	return printed;
1308 }
1309 
1310 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1311 					      struct syscall_arg *arg)
1312 {
1313 	int fd = arg->val;
1314 	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1315 	struct thread_trace *ttrace = arg->thread->priv;
1316 
1317 	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1318 		zfree(&ttrace->paths.table[fd]);
1319 
1320 	return printed;
1321 }
1322 
1323 static bool trace__filter_duration(struct trace *trace, double t)
1324 {
1325 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1326 }
1327 
1328 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1329 {
1330 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1331 
1332 	return fprintf(fp, "%10.3f ", ts);
1333 }
1334 
/*
 * Flags set asynchronously by sig_handler() and read by the rest of the
 * file. They are volatile so that code polling them always re-reads the
 * value instead of relying on a copy cached before the signal arrived.
 */
static volatile bool done = false;
static volatile bool interrupted = false;

/*
 * Signal handler: request termination, and record whether the request
 * came from SIGINT (as opposed to any other signal it is installed for).
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1343 
1344 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1345 					u64 duration, u64 tstamp, FILE *fp)
1346 {
1347 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1348 	printed += fprintf_duration(duration, fp);
1349 
1350 	if (trace->multiple_threads) {
1351 		if (trace->show_comm)
1352 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1353 		printed += fprintf(fp, "%d ", thread->tid);
1354 	}
1355 
1356 	return printed;
1357 }
1358 
1359 static int trace__process_event(struct trace *trace, struct machine *machine,
1360 				union perf_event *event, struct perf_sample *sample)
1361 {
1362 	int ret = 0;
1363 
1364 	switch (event->header.type) {
1365 	case PERF_RECORD_LOST:
1366 		color_fprintf(trace->output, PERF_COLOR_RED,
1367 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1368 		ret = machine__process_lost_event(machine, event, sample);
1369 	default:
1370 		ret = machine__process_event(machine, event, sample);
1371 		break;
1372 	}
1373 
1374 	return ret;
1375 }
1376 
1377 static int trace__tool_process(struct perf_tool *tool,
1378 			       union perf_event *event,
1379 			       struct perf_sample *sample,
1380 			       struct machine *machine)
1381 {
1382 	struct trace *trace = container_of(tool, struct trace, tool);
1383 	return trace__process_event(trace, machine, event, sample);
1384 }
1385 
1386 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
1387 {
1388 	int err = symbol__init();
1389 
1390 	if (err)
1391 		return err;
1392 
1393 	trace->host = machine__new_host();
1394 	if (trace->host == NULL)
1395 		return -ENOMEM;
1396 
1397 	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
1398 					    evlist->threads, trace__tool_process, false);
1399 	if (err)
1400 		symbol__exit();
1401 
1402 	return err;
1403 }
1404 
1405 static int syscall__set_arg_fmts(struct syscall *sc)
1406 {
1407 	struct format_field *field;
1408 	int idx = 0;
1409 
1410 	sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
1411 	if (sc->arg_scnprintf == NULL)
1412 		return -1;
1413 
1414 	if (sc->fmt)
1415 		sc->arg_parm = sc->fmt->arg_parm;
1416 
1417 	for (field = sc->tp_format->format.fields->next; field; field = field->next) {
1418 		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
1419 			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
1420 		else if (field->flags & FIELD_IS_POINTER)
1421 			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
1422 		++idx;
1423 	}
1424 
1425 	return 0;
1426 }
1427 
1428 static int trace__read_syscall_info(struct trace *trace, int id)
1429 {
1430 	char tp_name[128];
1431 	struct syscall *sc;
1432 	const char *name = audit_syscall_to_name(id, trace->audit.machine);
1433 
1434 	if (name == NULL)
1435 		return -1;
1436 
1437 	if (id > trace->syscalls.max) {
1438 		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
1439 
1440 		if (nsyscalls == NULL)
1441 			return -1;
1442 
1443 		if (trace->syscalls.max != -1) {
1444 			memset(nsyscalls + trace->syscalls.max + 1, 0,
1445 			       (id - trace->syscalls.max) * sizeof(*sc));
1446 		} else {
1447 			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
1448 		}
1449 
1450 		trace->syscalls.table = nsyscalls;
1451 		trace->syscalls.max   = id;
1452 	}
1453 
1454 	sc = trace->syscalls.table + id;
1455 	sc->name = name;
1456 
1457 	if (trace->ev_qualifier) {
1458 		bool in = strlist__find(trace->ev_qualifier, name) != NULL;
1459 
1460 		if (!(in ^ trace->not_ev_qualifier)) {
1461 			sc->filtered = true;
1462 			/*
1463 			 * No need to do read tracepoint information since this will be
1464 			 * filtered out.
1465 			 */
1466 			return 0;
1467 		}
1468 	}
1469 
1470 	sc->fmt  = syscall_fmt__find(sc->name);
1471 
1472 	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
1473 	sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1474 
1475 	if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
1476 		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
1477 		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
1478 	}
1479 
1480 	if (sc->tp_format == NULL)
1481 		return -1;
1482 
1483 	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
1484 
1485 	return syscall__set_arg_fmts(sc);
1486 }
1487 
1488 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1489 				      unsigned long *args, struct trace *trace,
1490 				      struct thread *thread)
1491 {
1492 	size_t printed = 0;
1493 
1494 	if (sc->tp_format != NULL) {
1495 		struct format_field *field;
1496 		u8 bit = 1;
1497 		struct syscall_arg arg = {
1498 			.idx	= 0,
1499 			.mask	= 0,
1500 			.trace  = trace,
1501 			.thread = thread,
1502 		};
1503 
1504 		for (field = sc->tp_format->format.fields->next; field;
1505 		     field = field->next, ++arg.idx, bit <<= 1) {
1506 			if (arg.mask & bit)
1507 				continue;
1508 			/*
1509  			 * Suppress this argument if its value is zero and
1510  			 * and we don't have a string associated in an
1511  			 * strarray for it.
1512  			 */
1513 			if (args[arg.idx] == 0 &&
1514 			    !(sc->arg_scnprintf &&
1515 			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1516 			      sc->arg_parm[arg.idx]))
1517 				continue;
1518 
1519 			printed += scnprintf(bf + printed, size - printed,
1520 					     "%s%s: ", printed ? ", " : "", field->name);
1521 			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1522 				arg.val = args[arg.idx];
1523 				if (sc->arg_parm)
1524 					arg.parm = sc->arg_parm[arg.idx];
1525 				printed += sc->arg_scnprintf[arg.idx](bf + printed,
1526 								      size - printed, &arg);
1527 			} else {
1528 				printed += scnprintf(bf + printed, size - printed,
1529 						     "%ld", args[arg.idx]);
1530 			}
1531 		}
1532 	} else {
1533 		int i = 0;
1534 
1535 		while (i < 6) {
1536 			printed += scnprintf(bf + printed, size - printed,
1537 					     "%sarg%d: %ld",
1538 					     printed ? ", " : "", i, args[i]);
1539 			++i;
1540 		}
1541 	}
1542 
1543 	return printed;
1544 }
1545 
/*
 * Signature of the per-event handlers stored in evsel->handler and invoked
 * from trace__process_sample() for every sample of that event.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1549 
1550 static struct syscall *trace__syscall_info(struct trace *trace,
1551 					   struct perf_evsel *evsel, int id)
1552 {
1553 
1554 	if (id < 0) {
1555 
1556 		/*
1557 		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
1558 		 * before that, leaving at a higher verbosity level till that is
1559 		 * explained. Reproduced with plain ftrace with:
1560 		 *
1561 		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
1562 		 * grep "NR -1 " /t/trace_pipe
1563 		 *
1564 		 * After generating some load on the machine.
1565  		 */
1566 		if (verbose > 1) {
1567 			static u64 n;
1568 			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
1569 				id, perf_evsel__name(evsel), ++n);
1570 		}
1571 		return NULL;
1572 	}
1573 
1574 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
1575 	    trace__read_syscall_info(trace, id))
1576 		goto out_cant_read;
1577 
1578 	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
1579 		goto out_cant_read;
1580 
1581 	return &trace->syscalls.table[id];
1582 
1583 out_cant_read:
1584 	if (verbose) {
1585 		fprintf(trace->output, "Problems reading syscall %d", id);
1586 		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
1587 			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
1588 		fputs(" information\n", trace->output);
1589 	}
1590 	return NULL;
1591 }
1592 
1593 static void thread__update_stats(struct thread_trace *ttrace,
1594 				 int id, struct perf_sample *sample)
1595 {
1596 	struct int_node *inode;
1597 	struct stats *stats;
1598 	u64 duration = 0;
1599 
1600 	inode = intlist__findnew(ttrace->syscall_stats, id);
1601 	if (inode == NULL)
1602 		return;
1603 
1604 	stats = inode->priv;
1605 	if (stats == NULL) {
1606 		stats = malloc(sizeof(struct stats));
1607 		if (stats == NULL)
1608 			return;
1609 		init_stats(stats);
1610 		inode->priv = stats;
1611 	}
1612 
1613 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1614 		duration = sample->time - ttrace->entry_time;
1615 
1616 	update_stats(stats, duration);
1617 }
1618 
1619 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
1620 			    union perf_event *event __maybe_unused,
1621 			    struct perf_sample *sample)
1622 {
1623 	char *msg;
1624 	void *args;
1625 	size_t printed = 0;
1626 	struct thread *thread;
1627 	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1628 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1629 	struct thread_trace *ttrace;
1630 
1631 	if (sc == NULL)
1632 		return -1;
1633 
1634 	if (sc->filtered)
1635 		return 0;
1636 
1637 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1638 	ttrace = thread__trace(thread, trace->output);
1639 	if (ttrace == NULL)
1640 		return -1;
1641 
1642 	args = perf_evsel__sc_tp_ptr(evsel, args, sample);
1643 
1644 	if (ttrace->entry_str == NULL) {
1645 		ttrace->entry_str = malloc(1024);
1646 		if (!ttrace->entry_str)
1647 			return -1;
1648 	}
1649 
1650 	ttrace->entry_time = sample->time;
1651 	msg = ttrace->entry_str;
1652 	printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
1653 
1654 	printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
1655 					   args, trace, thread);
1656 
1657 	if (sc->is_exit) {
1658 		if (!trace->duration_filter && !trace->summary_only) {
1659 			trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
1660 			fprintf(trace->output, "%-70s\n", ttrace->entry_str);
1661 		}
1662 	} else
1663 		ttrace->entry_pending = true;
1664 
1665 	return 0;
1666 }
1667 
1668 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
1669 			   union perf_event *event __maybe_unused,
1670 			   struct perf_sample *sample)
1671 {
1672 	int ret;
1673 	u64 duration = 0;
1674 	struct thread *thread;
1675 	int id = perf_evsel__sc_tp_uint(evsel, id, sample);
1676 	struct syscall *sc = trace__syscall_info(trace, evsel, id);
1677 	struct thread_trace *ttrace;
1678 
1679 	if (sc == NULL)
1680 		return -1;
1681 
1682 	if (sc->filtered)
1683 		return 0;
1684 
1685 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1686 	ttrace = thread__trace(thread, trace->output);
1687 	if (ttrace == NULL)
1688 		return -1;
1689 
1690 	if (trace->summary)
1691 		thread__update_stats(ttrace, id, sample);
1692 
1693 	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
1694 
1695 	if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
1696 		trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
1697 		trace->last_vfs_getname = NULL;
1698 		++trace->stats.vfs_getname;
1699 	}
1700 
1701 	ttrace->exit_time = sample->time;
1702 
1703 	if (ttrace->entry_time) {
1704 		duration = sample->time - ttrace->entry_time;
1705 		if (trace__filter_duration(trace, duration))
1706 			goto out;
1707 	} else if (trace->duration_filter)
1708 		goto out;
1709 
1710 	if (trace->summary_only)
1711 		goto out;
1712 
1713 	trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
1714 
1715 	if (ttrace->entry_pending) {
1716 		fprintf(trace->output, "%-70s", ttrace->entry_str);
1717 	} else {
1718 		fprintf(trace->output, " ... [");
1719 		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
1720 		fprintf(trace->output, "]: %s()", sc->name);
1721 	}
1722 
1723 	if (sc->fmt == NULL) {
1724 signed_print:
1725 		fprintf(trace->output, ") = %d", ret);
1726 	} else if (ret < 0 && sc->fmt->errmsg) {
1727 		char bf[256];
1728 		const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
1729 			   *e = audit_errno_to_name(-ret);
1730 
1731 		fprintf(trace->output, ") = -1 %s %s", e, emsg);
1732 	} else if (ret == 0 && sc->fmt->timeout)
1733 		fprintf(trace->output, ") = 0 Timeout");
1734 	else if (sc->fmt->hexret)
1735 		fprintf(trace->output, ") = %#x", ret);
1736 	else
1737 		goto signed_print;
1738 
1739 	fputc('\n', trace->output);
1740 out:
1741 	ttrace->entry_pending = false;
1742 
1743 	return 0;
1744 }
1745 
1746 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
1747 			      union perf_event *event __maybe_unused,
1748 			      struct perf_sample *sample)
1749 {
1750 	trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
1751 	return 0;
1752 }
1753 
1754 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
1755 				     union perf_event *event __maybe_unused,
1756 				     struct perf_sample *sample)
1757 {
1758         u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
1759 	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
1760 	struct thread *thread = machine__findnew_thread(trace->host,
1761 							sample->pid,
1762 							sample->tid);
1763 	struct thread_trace *ttrace = thread__trace(thread, trace->output);
1764 
1765 	if (ttrace == NULL)
1766 		goto out_dump;
1767 
1768 	ttrace->runtime_ms += runtime_ms;
1769 	trace->runtime_ms += runtime_ms;
1770 	return 0;
1771 
1772 out_dump:
1773 	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
1774 	       evsel->name,
1775 	       perf_evsel__strval(evsel, sample, "comm"),
1776 	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
1777 	       runtime,
1778 	       perf_evsel__intval(evsel, sample, "vruntime"));
1779 	return 0;
1780 }
1781 
1782 static void print_location(FILE *f, struct perf_sample *sample,
1783 			   struct addr_location *al,
1784 			   bool print_dso, bool print_sym)
1785 {
1786 
1787 	if ((verbose || print_dso) && al->map)
1788 		fprintf(f, "%s@", al->map->dso->long_name);
1789 
1790 	if ((verbose || print_sym) && al->sym)
1791 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1792 			al->addr - al->sym->start);
1793 	else if (al->map)
1794 		fprintf(f, "0x%" PRIx64, al->addr);
1795 	else
1796 		fprintf(f, "0x%" PRIx64, sample->addr);
1797 }
1798 
1799 static int trace__pgfault(struct trace *trace,
1800 			  struct perf_evsel *evsel,
1801 			  union perf_event *event,
1802 			  struct perf_sample *sample)
1803 {
1804 	struct thread *thread;
1805 	u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
1806 	struct addr_location al;
1807 	char map_type = 'd';
1808 	struct thread_trace *ttrace;
1809 
1810 	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1811 	ttrace = thread__trace(thread, trace->output);
1812 	if (ttrace == NULL)
1813 		return -1;
1814 
1815 	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
1816 		ttrace->pfmaj++;
1817 	else
1818 		ttrace->pfmin++;
1819 
1820 	if (trace->summary_only)
1821 		return 0;
1822 
1823 	thread__find_addr_location(thread, trace->host, cpumode, MAP__FUNCTION,
1824 			      sample->ip, &al);
1825 
1826 	trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output);
1827 
1828 	fprintf(trace->output, "%sfault [",
1829 		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
1830 		"maj" : "min");
1831 
1832 	print_location(trace->output, sample, &al, false, true);
1833 
1834 	fprintf(trace->output, "] => ");
1835 
1836 	thread__find_addr_location(thread, trace->host, cpumode, MAP__VARIABLE,
1837 				   sample->addr, &al);
1838 
1839 	if (!al.map) {
1840 		thread__find_addr_location(thread, trace->host, cpumode,
1841 					   MAP__FUNCTION, sample->addr, &al);
1842 
1843 		if (al.map)
1844 			map_type = 'x';
1845 		else
1846 			map_type = '?';
1847 	}
1848 
1849 	print_location(trace->output, sample, &al, true, false);
1850 
1851 	fprintf(trace->output, " (%c%c)\n", map_type, al.level);
1852 
1853 	return 0;
1854 }
1855 
1856 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1857 {
1858 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1859 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1860 		return false;
1861 
1862 	if (trace->pid_list || trace->tid_list)
1863 		return true;
1864 
1865 	return false;
1866 }
1867 
1868 static int trace__process_sample(struct perf_tool *tool,
1869 				 union perf_event *event,
1870 				 struct perf_sample *sample,
1871 				 struct perf_evsel *evsel,
1872 				 struct machine *machine __maybe_unused)
1873 {
1874 	struct trace *trace = container_of(tool, struct trace, tool);
1875 	int err = 0;
1876 
1877 	tracepoint_handler handler = evsel->handler;
1878 
1879 	if (skip_sample(trace, sample))
1880 		return 0;
1881 
1882 	if (!trace->full_time && trace->base_time == 0)
1883 		trace->base_time = sample->time;
1884 
1885 	if (handler) {
1886 		++trace->nr_events;
1887 		handler(trace, evsel, event, sample);
1888 	}
1889 
1890 	return err;
1891 }
1892 
1893 static int parse_target_str(struct trace *trace)
1894 {
1895 	if (trace->opts.target.pid) {
1896 		trace->pid_list = intlist__new(trace->opts.target.pid);
1897 		if (trace->pid_list == NULL) {
1898 			pr_err("Error parsing process id string\n");
1899 			return -EINVAL;
1900 		}
1901 	}
1902 
1903 	if (trace->opts.target.tid) {
1904 		trace->tid_list = intlist__new(trace->opts.target.tid);
1905 		if (trace->tid_list == NULL) {
1906 			pr_err("Error parsing thread id string\n");
1907 			return -EINVAL;
1908 		}
1909 	}
1910 
1911 	return 0;
1912 }
1913 
1914 static int trace__record(struct trace *trace, int argc, const char **argv)
1915 {
1916 	unsigned int rec_argc, i, j;
1917 	const char **rec_argv;
1918 	const char * const record_args[] = {
1919 		"record",
1920 		"-R",
1921 		"-m", "1024",
1922 		"-c", "1",
1923 	};
1924 
1925 	const char * const sc_args[] = { "-e", };
1926 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1927 	const char * const majpf_args[] = { "-e", "major-faults" };
1928 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1929 	const char * const minpf_args[] = { "-e", "minor-faults" };
1930 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1931 
1932 	/* +1 is for the event string below */
1933 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1934 		majpf_args_nr + minpf_args_nr + argc;
1935 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
1936 
1937 	if (rec_argv == NULL)
1938 		return -ENOMEM;
1939 
1940 	j = 0;
1941 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
1942 		rec_argv[j++] = record_args[i];
1943 
1944 	if (trace->trace_syscalls) {
1945 		for (i = 0; i < sc_args_nr; i++)
1946 			rec_argv[j++] = sc_args[i];
1947 
1948 		/* event string may be different for older kernels - e.g., RHEL6 */
1949 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
1950 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
1951 		else if (is_valid_tracepoint("syscalls:sys_enter"))
1952 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
1953 		else {
1954 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
1955 			return -1;
1956 		}
1957 	}
1958 
1959 	if (trace->trace_pgfaults & TRACE_PFMAJ)
1960 		for (i = 0; i < majpf_args_nr; i++)
1961 			rec_argv[j++] = majpf_args[i];
1962 
1963 	if (trace->trace_pgfaults & TRACE_PFMIN)
1964 		for (i = 0; i < minpf_args_nr; i++)
1965 			rec_argv[j++] = minpf_args[i];
1966 
1967 	for (i = 0; i < (unsigned int)argc; i++)
1968 		rec_argv[j++] = argv[i];
1969 
1970 	return cmd_record(j, rec_argv, NULL);
1971 }
1972 
1973 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
1974 
1975 static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
1976 {
1977 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
1978 	if (evsel == NULL)
1979 		return;
1980 
1981 	if (perf_evsel__field(evsel, "pathname") == NULL) {
1982 		perf_evsel__delete(evsel);
1983 		return;
1984 	}
1985 
1986 	evsel->handler = trace__vfs_getname;
1987 	perf_evlist__add(evlist, evsel);
1988 }
1989 
1990 static int perf_evlist__add_pgfault(struct perf_evlist *evlist,
1991 				    u64 config)
1992 {
1993 	struct perf_evsel *evsel;
1994 	struct perf_event_attr attr = {
1995 		.type = PERF_TYPE_SOFTWARE,
1996 		.mmap_data = 1,
1997 	};
1998 
1999 	attr.config = config;
2000 	attr.sample_period = 1;
2001 
2002 	event_attr_init(&attr);
2003 
2004 	evsel = perf_evsel__new(&attr);
2005 	if (!evsel)
2006 		return -ENOMEM;
2007 
2008 	evsel->handler = trace__pgfault;
2009 	perf_evlist__add(evlist, evsel);
2010 
2011 	return 0;
2012 }
2013 
2014 static int trace__run(struct trace *trace, int argc, const char **argv)
2015 {
2016 	struct perf_evlist *evlist = perf_evlist__new();
2017 	struct perf_evsel *evsel;
2018 	int err = -1, i;
2019 	unsigned long before;
2020 	const bool forks = argc > 0;
2021 
2022 	trace->live = true;
2023 
2024 	if (evlist == NULL) {
2025 		fprintf(trace->output, "Not enough memory to run!\n");
2026 		goto out;
2027 	}
2028 
2029 	if (trace->trace_syscalls &&
2030 	    perf_evlist__add_syscall_newtp(evlist, trace__sys_enter,
2031 					   trace__sys_exit))
2032 		goto out_error_tp;
2033 
2034 	if (trace->trace_syscalls)
2035 		perf_evlist__add_vfs_getname(evlist);
2036 
2037 	if ((trace->trace_pgfaults & TRACE_PFMAJ) &&
2038 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MAJ))
2039 		goto out_error_tp;
2040 
2041 	if ((trace->trace_pgfaults & TRACE_PFMIN) &&
2042 	    perf_evlist__add_pgfault(evlist, PERF_COUNT_SW_PAGE_FAULTS_MIN))
2043 		goto out_error_tp;
2044 
2045 	if (trace->sched &&
2046 		perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
2047 				trace__sched_stat_runtime))
2048 		goto out_error_tp;
2049 
2050 	err = perf_evlist__create_maps(evlist, &trace->opts.target);
2051 	if (err < 0) {
2052 		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
2053 		goto out_delete_evlist;
2054 	}
2055 
2056 	err = trace__symbols_init(trace, evlist);
2057 	if (err < 0) {
2058 		fprintf(trace->output, "Problems initializing symbol libraries!\n");
2059 		goto out_delete_evlist;
2060 	}
2061 
2062 	perf_evlist__config(evlist, &trace->opts);
2063 
2064 	signal(SIGCHLD, sig_handler);
2065 	signal(SIGINT, sig_handler);
2066 
2067 	if (forks) {
2068 		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
2069 						    argv, false, NULL);
2070 		if (err < 0) {
2071 			fprintf(trace->output, "Couldn't run the workload!\n");
2072 			goto out_delete_evlist;
2073 		}
2074 	}
2075 
2076 	err = perf_evlist__open(evlist);
2077 	if (err < 0)
2078 		goto out_error_open;
2079 
2080 	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
2081 	if (err < 0) {
2082 		fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
2083 		goto out_delete_evlist;
2084 	}
2085 
2086 	perf_evlist__enable(evlist);
2087 
2088 	if (forks)
2089 		perf_evlist__start_workload(evlist);
2090 
2091 	trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
2092 again:
2093 	before = trace->nr_events;
2094 
2095 	for (i = 0; i < evlist->nr_mmaps; i++) {
2096 		union perf_event *event;
2097 
2098 		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
2099 			const u32 type = event->header.type;
2100 			tracepoint_handler handler;
2101 			struct perf_sample sample;
2102 
2103 			++trace->nr_events;
2104 
2105 			err = perf_evlist__parse_sample(evlist, event, &sample);
2106 			if (err) {
2107 				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
2108 				goto next_event;
2109 			}
2110 
2111 			if (!trace->full_time && trace->base_time == 0)
2112 				trace->base_time = sample.time;
2113 
2114 			if (type != PERF_RECORD_SAMPLE) {
2115 				trace__process_event(trace, trace->host, event, &sample);
2116 				continue;
2117 			}
2118 
2119 			evsel = perf_evlist__id2evsel(evlist, sample.id);
2120 			if (evsel == NULL) {
2121 				fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
2122 				goto next_event;
2123 			}
2124 
2125 			if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2126 			    sample.raw_data == NULL) {
2127 				fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2128 				       perf_evsel__name(evsel), sample.tid,
2129 				       sample.cpu, sample.raw_size);
2130 				goto next_event;
2131 			}
2132 
2133 			handler = evsel->handler;
2134 			handler(trace, evsel, event, &sample);
2135 next_event:
2136 			perf_evlist__mmap_consume(evlist, i);
2137 
2138 			if (interrupted)
2139 				goto out_disable;
2140 		}
2141 	}
2142 
2143 	if (trace->nr_events == before) {
2144 		int timeout = done ? 100 : -1;
2145 
2146 		if (poll(evlist->pollfd, evlist->nr_fds, timeout) > 0)
2147 			goto again;
2148 	} else {
2149 		goto again;
2150 	}
2151 
2152 out_disable:
2153 	perf_evlist__disable(evlist);
2154 
2155 	if (!err) {
2156 		if (trace->summary)
2157 			trace__fprintf_thread_summary(trace, trace->output);
2158 
2159 		if (trace->show_tool_stats) {
2160 			fprintf(trace->output, "Stats:\n "
2161 					       " vfs_getname : %" PRIu64 "\n"
2162 					       " proc_getname: %" PRIu64 "\n",
2163 				trace->stats.vfs_getname,
2164 				trace->stats.proc_getname);
2165 		}
2166 	}
2167 
2168 out_delete_evlist:
2169 	perf_evlist__delete(evlist);
2170 out:
2171 	trace->live = false;
2172 	return err;
2173 {
2174 	char errbuf[BUFSIZ];
2175 
2176 out_error_tp:
2177 	perf_evlist__strerror_tp(evlist, errno, errbuf, sizeof(errbuf));
2178 	goto out_error;
2179 
2180 out_error_open:
2181 	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
2182 
2183 out_error:
2184 	fprintf(trace->output, "%s\n", errbuf);
2185 	goto out_delete_evlist;
2186 }
2187 }
2188 
2189 static int trace__replay(struct trace *trace)
2190 {
2191 	const struct perf_evsel_str_handler handlers[] = {
2192 		{ "probe:vfs_getname",	     trace__vfs_getname, },
2193 	};
2194 	struct perf_data_file file = {
2195 		.path  = input_name,
2196 		.mode  = PERF_DATA_MODE_READ,
2197 	};
2198 	struct perf_session *session;
2199 	struct perf_evsel *evsel;
2200 	int err = -1;
2201 
2202 	trace->tool.sample	  = trace__process_sample;
2203 	trace->tool.mmap	  = perf_event__process_mmap;
2204 	trace->tool.mmap2	  = perf_event__process_mmap2;
2205 	trace->tool.comm	  = perf_event__process_comm;
2206 	trace->tool.exit	  = perf_event__process_exit;
2207 	trace->tool.fork	  = perf_event__process_fork;
2208 	trace->tool.attr	  = perf_event__process_attr;
2209 	trace->tool.tracing_data = perf_event__process_tracing_data;
2210 	trace->tool.build_id	  = perf_event__process_build_id;
2211 
2212 	trace->tool.ordered_samples = true;
2213 	trace->tool.ordering_requires_timestamps = true;
2214 
2215 	/* add tid to output */
2216 	trace->multiple_threads = true;
2217 
2218 	if (symbol__init() < 0)
2219 		return -1;
2220 
2221 	session = perf_session__new(&file, false, &trace->tool);
2222 	if (session == NULL)
2223 		return -ENOMEM;
2224 
2225 	trace->host = &session->machines.host;
2226 
2227 	err = perf_session__set_tracepoints_handlers(session, handlers);
2228 	if (err)
2229 		goto out;
2230 
2231 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2232 						     "raw_syscalls:sys_enter");
2233 	/* older kernels have syscalls tp versus raw_syscalls */
2234 	if (evsel == NULL)
2235 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2236 							     "syscalls:sys_enter");
2237 
2238 	if (evsel &&
2239 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
2240 	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
2241 		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
2242 		goto out;
2243 	}
2244 
2245 	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2246 						     "raw_syscalls:sys_exit");
2247 	if (evsel == NULL)
2248 		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
2249 							     "syscalls:sys_exit");
2250 	if (evsel &&
2251 	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
2252 	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
2253 		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
2254 		goto out;
2255 	}
2256 
2257 	evlist__for_each(session->evlist, evsel) {
2258 		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
2259 		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
2260 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
2261 		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
2262 			evsel->handler = trace__pgfault;
2263 	}
2264 
2265 	err = parse_target_str(trace);
2266 	if (err != 0)
2267 		goto out;
2268 
2269 	setup_pager();
2270 
2271 	err = perf_session__process_events(session, &trace->tool);
2272 	if (err)
2273 		pr_err("Failed to process events, error %d", err);
2274 
2275 	else if (trace->summary)
2276 		trace__fprintf_thread_summary(trace, trace->output);
2277 
2278 out:
2279 	perf_session__delete(session);
2280 
2281 	return err;
2282 }
2283 
/*
 * Write the heading of the per-thread summary to @fp.
 *
 * Returns the fprintf() result converted to size_t, i.e. the number of
 * characters written on success.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return (size_t)fprintf(fp, "\n Summary of events:\n\n");
}
2292 
2293 static size_t thread__dump_stats(struct thread_trace *ttrace,
2294 				 struct trace *trace, FILE *fp)
2295 {
2296 	struct stats *stats;
2297 	size_t printed = 0;
2298 	struct syscall *sc;
2299 	struct int_node *inode = intlist__first(ttrace->syscall_stats);
2300 
2301 	if (inode == NULL)
2302 		return 0;
2303 
2304 	printed += fprintf(fp, "\n");
2305 
2306 	printed += fprintf(fp, "   syscall            calls      min       avg       max      stddev\n");
2307 	printed += fprintf(fp, "                               (msec)    (msec)    (msec)        (%%)\n");
2308 	printed += fprintf(fp, "   --------------- -------- --------- --------- ---------     ------\n");
2309 
2310 	/* each int_node is a syscall */
2311 	while (inode) {
2312 		stats = inode->priv;
2313 		if (stats) {
2314 			double min = (double)(stats->min) / NSEC_PER_MSEC;
2315 			double max = (double)(stats->max) / NSEC_PER_MSEC;
2316 			double avg = avg_stats(stats);
2317 			double pct;
2318 			u64 n = (u64) stats->n;
2319 
2320 			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
2321 			avg /= NSEC_PER_MSEC;
2322 
2323 			sc = &trace->syscalls.table[inode->i];
2324 			printed += fprintf(fp, "   %-15s", sc->name);
2325 			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
2326 					   n, min, avg);
2327 			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
2328 		}
2329 
2330 		inode = intlist__next(inode);
2331 	}
2332 
2333 	printed += fprintf(fp, "\n\n");
2334 
2335 	return printed;
2336 }
2337 
/*
 * Context handed to the per-thread summary callback through its opaque
 * priv pointer.
 */
struct summary_data {
	FILE *fp;		/* stream the summary is written to */
	struct trace *trace;	/* tracer whose threads are being summarized */
	size_t printed;		/* accumulated fprintf() return values */
};
2344 
2345 static int trace__fprintf_one_thread(struct thread *thread, void *priv)
2346 {
2347 	struct summary_data *data = priv;
2348 	FILE *fp = data->fp;
2349 	size_t printed = data->printed;
2350 	struct trace *trace = data->trace;
2351 	struct thread_trace *ttrace = thread->priv;
2352 	double ratio;
2353 
2354 	if (ttrace == NULL)
2355 		return 0;
2356 
2357 	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2358 
2359 	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2360 	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2361 	printed += fprintf(fp, "%.1f%%", ratio);
2362 	if (ttrace->pfmaj)
2363 		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2364 	if (ttrace->pfmin)
2365 		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2366 	printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2367 	printed += thread__dump_stats(ttrace, trace, fp);
2368 
2369 	data->printed += printed;
2370 
2371 	return 0;
2372 }
2373 
2374 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
2375 {
2376 	struct summary_data data = {
2377 		.fp = fp,
2378 		.trace = trace
2379 	};
2380 	data.printed = trace__fprintf_threads_header(fp);
2381 
2382 	machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
2383 
2384 	return data.printed;
2385 }
2386 
2387 static int trace__set_duration(const struct option *opt, const char *str,
2388 			       int unset __maybe_unused)
2389 {
2390 	struct trace *trace = opt->value;
2391 
2392 	trace->duration_filter = atof(str);
2393 	return 0;
2394 }
2395 
2396 static int trace__open_output(struct trace *trace, const char *filename)
2397 {
2398 	struct stat st;
2399 
2400 	if (!stat(filename, &st) && st.st_size) {
2401 		char oldname[PATH_MAX];
2402 
2403 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2404 		unlink(oldname);
2405 		rename(filename, oldname);
2406 	}
2407 
2408 	trace->output = fopen(filename, "w");
2409 
2410 	return trace->output == NULL ? -errno : 0;
2411 }
2412 
2413 static int parse_pagefaults(const struct option *opt, const char *str,
2414 			    int unset __maybe_unused)
2415 {
2416 	int *trace_pgfaults = opt->value;
2417 
2418 	if (strcmp(str, "all") == 0)
2419 		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2420 	else if (strcmp(str, "maj") == 0)
2421 		*trace_pgfaults |= TRACE_PFMAJ;
2422 	else if (strcmp(str, "min") == 0)
2423 		*trace_pgfaults |= TRACE_PFMIN;
2424 	else
2425 		return -1;
2426 
2427 	return 0;
2428 }
2429 
2430 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
2431 {
2432 	const char * const trace_usage[] = {
2433 		"perf trace [<options>] [<command>]",
2434 		"perf trace [<options>] -- <command> [<options>]",
2435 		"perf trace record [<options>] [<command>]",
2436 		"perf trace record [<options>] -- <command> [<options>]",
2437 		NULL
2438 	};
2439 	struct trace trace = {
2440 		.audit = {
2441 			.machine = audit_detect_machine(),
2442 			.open_id = audit_name_to_syscall("open", trace.audit.machine),
2443 		},
2444 		.syscalls = {
2445 			. max = -1,
2446 		},
2447 		.opts = {
2448 			.target = {
2449 				.uid	   = UINT_MAX,
2450 				.uses_mmap = true,
2451 			},
2452 			.user_freq     = UINT_MAX,
2453 			.user_interval = ULLONG_MAX,
2454 			.no_buffering  = true,
2455 			.mmap_pages    = 1024,
2456 		},
2457 		.output = stdout,
2458 		.show_comm = true,
2459 		.trace_syscalls = true,
2460 	};
2461 	const char *output_name = NULL;
2462 	const char *ev_qualifier_str = NULL;
2463 	const struct option trace_options[] = {
2464 	OPT_BOOLEAN(0, "comm", &trace.show_comm,
2465 		    "show the thread COMM next to its id"),
2466 	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
2467 	OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
2468 		    "list of events to trace"),
2469 	OPT_STRING('o', "output", &output_name, "file", "output file name"),
2470 	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
2471 	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
2472 		    "trace events on existing process id"),
2473 	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
2474 		    "trace events on existing thread id"),
2475 	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
2476 		    "system-wide collection from all CPUs"),
2477 	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
2478 		    "list of cpus to monitor"),
2479 	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
2480 		    "child tasks do not inherit counters"),
2481 	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
2482 		     "number of mmap data pages",
2483 		     perf_evlist__parse_mmap_pages),
2484 	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
2485 		   "user to profile"),
2486 	OPT_CALLBACK(0, "duration", &trace, "float",
2487 		     "show only events with duration > N.M ms",
2488 		     trace__set_duration),
2489 	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
2490 	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
2491 	OPT_BOOLEAN('T', "time", &trace.full_time,
2492 		    "Show full timestamp, not time relative to first start"),
2493 	OPT_BOOLEAN('s', "summary", &trace.summary_only,
2494 		    "Show only syscall summary with statistics"),
2495 	OPT_BOOLEAN('S', "with-summary", &trace.summary,
2496 		    "Show all syscalls and summary with statistics"),
2497 	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
2498 		     "Trace pagefaults", parse_pagefaults, "maj"),
2499 	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
2500 	OPT_END()
2501 	};
2502 	int err;
2503 	char bf[BUFSIZ];
2504 
2505 	argc = parse_options(argc, argv, trace_options, trace_usage,
2506 			     PARSE_OPT_STOP_AT_NON_OPTION);
2507 
2508 	if (trace.trace_pgfaults) {
2509 		trace.opts.sample_address = true;
2510 		trace.opts.sample_time = true;
2511 	}
2512 
2513 	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
2514 		return trace__record(&trace, argc-1, &argv[1]);
2515 
2516 	/* summary_only implies summary option, but don't overwrite summary if set */
2517 	if (trace.summary_only)
2518 		trace.summary = trace.summary_only;
2519 
2520 	if (!trace.trace_syscalls && !trace.trace_pgfaults) {
2521 		pr_err("Please specify something to trace.\n");
2522 		return -1;
2523 	}
2524 
2525 	if (output_name != NULL) {
2526 		err = trace__open_output(&trace, output_name);
2527 		if (err < 0) {
2528 			perror("failed to create output file");
2529 			goto out;
2530 		}
2531 	}
2532 
2533 	if (ev_qualifier_str != NULL) {
2534 		const char *s = ev_qualifier_str;
2535 
2536 		trace.not_ev_qualifier = *s == '!';
2537 		if (trace.not_ev_qualifier)
2538 			++s;
2539 		trace.ev_qualifier = strlist__new(true, s);
2540 		if (trace.ev_qualifier == NULL) {
2541 			fputs("Not enough memory to parse event qualifier",
2542 			      trace.output);
2543 			err = -ENOMEM;
2544 			goto out_close;
2545 		}
2546 	}
2547 
2548 	err = target__validate(&trace.opts.target);
2549 	if (err) {
2550 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2551 		fprintf(trace.output, "%s", bf);
2552 		goto out_close;
2553 	}
2554 
2555 	err = target__parse_uid(&trace.opts.target);
2556 	if (err) {
2557 		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
2558 		fprintf(trace.output, "%s", bf);
2559 		goto out_close;
2560 	}
2561 
2562 	if (!argc && target__none(&trace.opts.target))
2563 		trace.opts.target.system_wide = true;
2564 
2565 	if (input_name)
2566 		err = trace__replay(&trace);
2567 	else
2568 		err = trace__run(&trace, argc, argv);
2569 
2570 out_close:
2571 	if (output_name != NULL)
2572 		fclose(trace.output);
2573 out:
2574 	return err;
2575 }
2576