1 /* 2 * builtin-trace.c 3 * 4 * Builtin 'trace' command: 5 * 6 * Display a continuously updated trace of any workload, CPU, specific PID, 7 * system wide, etc. Default format is loosely strace like, but any other 8 * event may be specified using --event. 9 * 10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> 11 * 12 * Initially based on the 'trace' prototype by Thomas Gleixner: 13 * 14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'") 15 * 16 * Released under the GPL v2. (and only v2, not any later version) 17 */ 18 19 #include <traceevent/event-parse.h> 20 #include <api/fs/tracing_path.h> 21 #include "builtin.h" 22 #include "util/color.h" 23 #include "util/debug.h" 24 #include "util/evlist.h" 25 #include <subcmd/exec-cmd.h> 26 #include "util/machine.h" 27 #include "util/session.h" 28 #include "util/thread.h" 29 #include <subcmd/parse-options.h> 30 #include "util/strlist.h" 31 #include "util/intlist.h" 32 #include "util/thread_map.h" 33 #include "util/stat.h" 34 #include "trace-event.h" 35 #include "util/parse-events.h" 36 #include "util/bpf-loader.h" 37 #include "callchain.h" 38 #include "syscalltbl.h" 39 #include "rb_resort.h" 40 41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */ 42 #include <stdlib.h> 43 #include <linux/err.h> 44 #include <linux/filter.h> 45 #include <linux/audit.h> 46 #include <linux/random.h> 47 #include <linux/stringify.h> 48 #include <linux/time64.h> 49 50 #ifndef O_CLOEXEC 51 # define O_CLOEXEC 02000000 52 #endif 53 54 struct trace { 55 struct perf_tool tool; 56 struct syscalltbl *sctbl; 57 struct { 58 int max; 59 struct syscall *table; 60 struct { 61 struct perf_evsel *sys_enter, 62 *sys_exit; 63 } events; 64 } syscalls; 65 struct record_opts opts; 66 struct perf_evlist *evlist; 67 struct machine *host; 68 struct thread *current; 69 u64 base_time; 70 FILE *output; 71 unsigned long nr_events; 72 struct strlist *ev_qualifier; 73 struct { 74 size_t 
nr; 75 int *entries; 76 } ev_qualifier_ids; 77 struct intlist *tid_list; 78 struct intlist *pid_list; 79 struct { 80 size_t nr; 81 pid_t *entries; 82 } filter_pids; 83 double duration_filter; 84 double runtime_ms; 85 struct { 86 u64 vfs_getname, 87 proc_getname; 88 } stats; 89 unsigned int max_stack; 90 unsigned int min_stack; 91 bool not_ev_qualifier; 92 bool live; 93 bool full_time; 94 bool sched; 95 bool multiple_threads; 96 bool summary; 97 bool summary_only; 98 bool show_comm; 99 bool show_tool_stats; 100 bool trace_syscalls; 101 bool kernel_syscallchains; 102 bool force; 103 bool vfs_getname; 104 int trace_pgfaults; 105 int open_id; 106 }; 107 108 struct tp_field { 109 int offset; 110 union { 111 u64 (*integer)(struct tp_field *field, struct perf_sample *sample); 112 void *(*pointer)(struct tp_field *field, struct perf_sample *sample); 113 }; 114 }; 115 116 #define TP_UINT_FIELD(bits) \ 117 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \ 118 { \ 119 u##bits value; \ 120 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 121 return value; \ 122 } 123 124 TP_UINT_FIELD(8); 125 TP_UINT_FIELD(16); 126 TP_UINT_FIELD(32); 127 TP_UINT_FIELD(64); 128 129 #define TP_UINT_FIELD__SWAPPED(bits) \ 130 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \ 131 { \ 132 u##bits value; \ 133 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 134 return bswap_##bits(value);\ 135 } 136 137 TP_UINT_FIELD__SWAPPED(16); 138 TP_UINT_FIELD__SWAPPED(32); 139 TP_UINT_FIELD__SWAPPED(64); 140 141 static int tp_field__init_uint(struct tp_field *field, 142 struct format_field *format_field, 143 bool needs_swap) 144 { 145 field->offset = format_field->offset; 146 147 switch (format_field->size) { 148 case 1: 149 field->integer = tp_field__u8; 150 break; 151 case 2: 152 field->integer = needs_swap ? 
tp_field__swapped_u16 : tp_field__u16; 153 break; 154 case 4: 155 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32; 156 break; 157 case 8: 158 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64; 159 break; 160 default: 161 return -1; 162 } 163 164 return 0; 165 } 166 167 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample) 168 { 169 return sample->raw_data + field->offset; 170 } 171 172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field) 173 { 174 field->offset = format_field->offset; 175 field->pointer = tp_field__ptr; 176 return 0; 177 } 178 179 struct syscall_tp { 180 struct tp_field id; 181 union { 182 struct tp_field args, ret; 183 }; 184 }; 185 186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel, 187 struct tp_field *field, 188 const char *name) 189 { 190 struct format_field *format_field = perf_evsel__field(evsel, name); 191 192 if (format_field == NULL) 193 return -1; 194 195 return tp_field__init_uint(field, format_field, evsel->needs_swap); 196 } 197 198 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \ 199 ({ struct syscall_tp *sc = evsel->priv;\ 200 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); }) 201 202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel, 203 struct tp_field *field, 204 const char *name) 205 { 206 struct format_field *format_field = perf_evsel__field(evsel, name); 207 208 if (format_field == NULL) 209 return -1; 210 211 return tp_field__init_ptr(field, format_field); 212 } 213 214 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \ 215 ({ struct syscall_tp *sc = evsel->priv;\ 216 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); }) 217 218 static void perf_evsel__delete_priv(struct perf_evsel *evsel) 219 { 220 zfree(&evsel->priv); 221 perf_evsel__delete(evsel); 222 } 223 224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler) 225 { 226 evsel->priv = 
malloc(sizeof(struct syscall_tp)); 227 if (evsel->priv != NULL) { 228 if (perf_evsel__init_sc_tp_uint_field(evsel, id)) 229 goto out_delete; 230 231 evsel->handler = handler; 232 return 0; 233 } 234 235 return -ENOMEM; 236 237 out_delete: 238 zfree(&evsel->priv); 239 return -ENOENT; 240 } 241 242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler) 243 { 244 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction); 245 246 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */ 247 if (IS_ERR(evsel)) 248 evsel = perf_evsel__newtp("syscalls", direction); 249 250 if (IS_ERR(evsel)) 251 return NULL; 252 253 if (perf_evsel__init_syscall_tp(evsel, handler)) 254 goto out_delete; 255 256 return evsel; 257 258 out_delete: 259 perf_evsel__delete_priv(evsel); 260 return NULL; 261 } 262 263 #define perf_evsel__sc_tp_uint(evsel, name, sample) \ 264 ({ struct syscall_tp *fields = evsel->priv; \ 265 fields->name.integer(&fields->name, sample); }) 266 267 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \ 268 ({ struct syscall_tp *fields = evsel->priv; \ 269 fields->name.pointer(&fields->name, sample); }) 270 271 struct syscall_arg { 272 unsigned long val; 273 struct thread *thread; 274 struct trace *trace; 275 void *parm; 276 u8 idx; 277 u8 mask; 278 }; 279 280 struct strarray { 281 int offset; 282 int nr_entries; 283 const char **entries; 284 }; 285 286 #define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \ 287 .nr_entries = ARRAY_SIZE(array), \ 288 .entries = array, \ 289 } 290 291 #define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \ 292 .offset = off, \ 293 .nr_entries = ARRAY_SIZE(array), \ 294 .entries = array, \ 295 } 296 297 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size, 298 const char *intfmt, 299 struct syscall_arg *arg) 300 { 301 struct strarray *sa = arg->parm; 302 int idx = arg->val - sa->offset; 303 304 if (idx < 0 || idx >= sa->nr_entries) 
305 return scnprintf(bf, size, intfmt, arg->val); 306 307 return scnprintf(bf, size, "%s", sa->entries[idx]); 308 } 309 310 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, 311 struct syscall_arg *arg) 312 { 313 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg); 314 } 315 316 #define SCA_STRARRAY syscall_arg__scnprintf_strarray 317 318 #if defined(__i386__) || defined(__x86_64__) 319 /* 320 * FIXME: Make this available to all arches as soon as the ioctl beautifier 321 * gets rewritten to support all arches. 322 */ 323 static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size, 324 struct syscall_arg *arg) 325 { 326 return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg); 327 } 328 329 #define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray 330 #endif /* defined(__i386__) || defined(__x86_64__) */ 331 332 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 333 struct syscall_arg *arg); 334 335 #define SCA_FD syscall_arg__scnprintf_fd 336 337 #ifndef AT_FDCWD 338 #define AT_FDCWD -100 339 #endif 340 341 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size, 342 struct syscall_arg *arg) 343 { 344 int fd = arg->val; 345 346 if (fd == AT_FDCWD) 347 return scnprintf(bf, size, "CWD"); 348 349 return syscall_arg__scnprintf_fd(bf, size, arg); 350 } 351 352 #define SCA_FDAT syscall_arg__scnprintf_fd_at 353 354 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 355 struct syscall_arg *arg); 356 357 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd 358 359 static size_t syscall_arg__scnprintf_hex(char *bf, size_t size, 360 struct syscall_arg *arg) 361 { 362 return scnprintf(bf, size, "%#lx", arg->val); 363 } 364 365 #define SCA_HEX syscall_arg__scnprintf_hex 366 367 static size_t syscall_arg__scnprintf_int(char *bf, size_t size, 368 struct syscall_arg *arg) 369 { 370 return scnprintf(bf, size, "%d", arg->val); 371 } 372 373 #define SCA_INT syscall_arg__scnprintf_int 374 375 static 
const char *bpf_cmd[] = { 376 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", 377 "MAP_GET_NEXT_KEY", "PROG_LOAD", 378 }; 379 static DEFINE_STRARRAY(bpf_cmd); 380 381 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", }; 382 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); 383 384 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", }; 385 static DEFINE_STRARRAY(itimers); 386 387 static const char *keyctl_options[] = { 388 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN", 389 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ", 390 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT", 391 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT", 392 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT", 393 }; 394 static DEFINE_STRARRAY(keyctl_options); 395 396 static const char *whences[] = { "SET", "CUR", "END", 397 #ifdef SEEK_DATA 398 "DATA", 399 #endif 400 #ifdef SEEK_HOLE 401 "HOLE", 402 #endif 403 }; 404 static DEFINE_STRARRAY(whences); 405 406 static const char *fcntl_cmds[] = { 407 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK", 408 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64", 409 "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX", 410 "F_GETOWNER_UIDS", 411 }; 412 static DEFINE_STRARRAY(fcntl_cmds); 413 414 static const char *rlimit_resources[] = { 415 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE", 416 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO", 417 "RTTIME", 418 }; 419 static DEFINE_STRARRAY(rlimit_resources); 420 421 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", }; 422 static DEFINE_STRARRAY(sighow); 423 424 static const char *clockid[] = { 425 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID", 426 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME", 427 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI" 428 }; 429 static 
DEFINE_STRARRAY(clockid); 430 431 static const char *socket_families[] = { 432 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", 433 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", 434 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", 435 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", 436 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", 437 "ALG", "NFC", "VSOCK", 438 }; 439 static DEFINE_STRARRAY(socket_families); 440 441 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, 442 struct syscall_arg *arg) 443 { 444 size_t printed = 0; 445 int mode = arg->val; 446 447 if (mode == F_OK) /* 0 */ 448 return scnprintf(bf, size, "F"); 449 #define P_MODE(n) \ 450 if (mode & n##_OK) { \ 451 printed += scnprintf(bf + printed, size - printed, "%s", #n); \ 452 mode &= ~n##_OK; \ 453 } 454 455 P_MODE(R); 456 P_MODE(W); 457 P_MODE(X); 458 #undef P_MODE 459 460 if (mode) 461 printed += scnprintf(bf + printed, size - printed, "|%#x", mode); 462 463 return printed; 464 } 465 466 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode 467 468 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 469 struct syscall_arg *arg); 470 471 #define SCA_FILENAME syscall_arg__scnprintf_filename 472 473 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, 474 struct syscall_arg *arg) 475 { 476 int printed = 0, flags = arg->val; 477 478 #define P_FLAG(n) \ 479 if (flags & O_##n) { \ 480 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 481 flags &= ~O_##n; \ 482 } 483 484 P_FLAG(CLOEXEC); 485 P_FLAG(NONBLOCK); 486 #undef P_FLAG 487 488 if (flags) 489 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? 
"|" : "", flags); 490 491 return printed; 492 } 493 494 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags 495 496 #if defined(__i386__) || defined(__x86_64__) 497 /* 498 * FIXME: Make this available to all arches. 499 */ 500 #define TCGETS 0x5401 501 502 static const char *tioctls[] = { 503 "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW", 504 "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL", 505 "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI", 506 "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC", 507 "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX", 508 "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO", 509 "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK", 510 "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2", 511 "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK", 512 "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG", 513 "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL", 514 [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG", 515 "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS", 516 "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI", 517 "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE", 518 }; 519 520 static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401); 521 #endif /* defined(__i386__) || defined(__x86_64__) */ 522 523 #ifndef GRND_NONBLOCK 524 #define GRND_NONBLOCK 0x0001 525 #endif 526 #ifndef GRND_RANDOM 527 #define GRND_RANDOM 0x0002 528 #endif 529 530 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, 531 struct syscall_arg *arg) 532 { 533 int printed = 0, flags = arg->val; 534 535 #define P_FLAG(n) \ 536 if (flags & GRND_##n) { \ 537 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? 
"|" : "", #n); \ 538 flags &= ~GRND_##n; \ 539 } 540 541 P_FLAG(RANDOM); 542 P_FLAG(NONBLOCK); 543 #undef P_FLAG 544 545 if (flags) 546 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 547 548 return printed; 549 } 550 551 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags 552 553 #define STRARRAY(arg, name, array) \ 554 .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \ 555 .arg_parm = { [arg] = &strarray__##array, } 556 557 #include "trace/beauty/eventfd.c" 558 #include "trace/beauty/flock.c" 559 #include "trace/beauty/futex_op.c" 560 #include "trace/beauty/mmap.c" 561 #include "trace/beauty/mode_t.c" 562 #include "trace/beauty/msg_flags.c" 563 #include "trace/beauty/open_flags.c" 564 #include "trace/beauty/perf_event_open.c" 565 #include "trace/beauty/pid.c" 566 #include "trace/beauty/sched_policy.c" 567 #include "trace/beauty/seccomp.c" 568 #include "trace/beauty/signum.c" 569 #include "trace/beauty/socket_type.c" 570 #include "trace/beauty/waitid_options.c" 571 572 static struct syscall_fmt { 573 const char *name; 574 const char *alias; 575 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg); 576 void *arg_parm[6]; 577 bool errmsg; 578 bool errpid; 579 bool timeout; 580 bool hexret; 581 } syscall_fmts[] = { 582 { .name = "access", .errmsg = true, 583 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, }, 584 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", }, 585 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), }, 586 { .name = "brk", .hexret = true, 587 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, }, 588 { .name = "chdir", .errmsg = true, }, 589 { .name = "chmod", .errmsg = true, }, 590 { .name = "chroot", .errmsg = true, }, 591 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), }, 592 { .name = "clone", .errpid = true, }, 593 { .name = "close", .errmsg = true, 594 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, }, 595 { .name = "connect", 
.errmsg = true, }, 596 { .name = "creat", .errmsg = true, }, 597 { .name = "dup", .errmsg = true, }, 598 { .name = "dup2", .errmsg = true, }, 599 { .name = "dup3", .errmsg = true, }, 600 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), }, 601 { .name = "eventfd2", .errmsg = true, 602 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, }, 603 { .name = "faccessat", .errmsg = true, }, 604 { .name = "fadvise64", .errmsg = true, }, 605 { .name = "fallocate", .errmsg = true, }, 606 { .name = "fchdir", .errmsg = true, }, 607 { .name = "fchmod", .errmsg = true, }, 608 { .name = "fchmodat", .errmsg = true, 609 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 610 { .name = "fchown", .errmsg = true, }, 611 { .name = "fchownat", .errmsg = true, 612 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 613 { .name = "fcntl", .errmsg = true, 614 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ }, 615 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, }, 616 { .name = "fdatasync", .errmsg = true, }, 617 { .name = "flock", .errmsg = true, 618 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, }, 619 { .name = "fsetxattr", .errmsg = true, }, 620 { .name = "fstat", .errmsg = true, .alias = "newfstat", }, 621 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", }, 622 { .name = "fstatfs", .errmsg = true, }, 623 { .name = "fsync", .errmsg = true, }, 624 { .name = "ftruncate", .errmsg = true, }, 625 { .name = "futex", .errmsg = true, 626 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, }, 627 { .name = "futimesat", .errmsg = true, 628 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 629 { .name = "getdents", .errmsg = true, }, 630 { .name = "getdents64", .errmsg = true, }, 631 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), }, 632 { .name = "getpid", .errpid = true, }, 633 { .name = "getpgid", .errpid = true, }, 634 { .name = "getppid", .errpid = true, }, 635 { .name = "getrandom", .errmsg = true, 636 .arg_scnprintf = { [2] = 
SCA_GETRANDOM_FLAGS, /* flags */ }, }, 637 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, 638 { .name = "getxattr", .errmsg = true, }, 639 { .name = "inotify_add_watch", .errmsg = true, }, 640 { .name = "ioctl", .errmsg = true, 641 .arg_scnprintf = { 642 #if defined(__i386__) || defined(__x86_64__) 643 /* 644 * FIXME: Make this available to all arches. 645 */ 646 [1] = SCA_STRHEXARRAY, /* cmd */ 647 [2] = SCA_HEX, /* arg */ }, 648 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, }, 649 #else 650 [2] = SCA_HEX, /* arg */ }, }, 651 #endif 652 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), }, 653 { .name = "kill", .errmsg = true, 654 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 655 { .name = "lchown", .errmsg = true, }, 656 { .name = "lgetxattr", .errmsg = true, }, 657 { .name = "linkat", .errmsg = true, 658 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 659 { .name = "listxattr", .errmsg = true, }, 660 { .name = "llistxattr", .errmsg = true, }, 661 { .name = "lremovexattr", .errmsg = true, }, 662 { .name = "lseek", .errmsg = true, 663 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ }, 664 .arg_parm = { [2] = &strarray__whences, /* whence */ }, }, 665 { .name = "lsetxattr", .errmsg = true, }, 666 { .name = "lstat", .errmsg = true, .alias = "newlstat", }, 667 { .name = "lsxattr", .errmsg = true, }, 668 { .name = "madvise", .errmsg = true, 669 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 670 [2] = SCA_MADV_BHV, /* behavior */ }, }, 671 { .name = "mkdir", .errmsg = true, }, 672 { .name = "mkdirat", .errmsg = true, 673 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 674 { .name = "mknod", .errmsg = true, }, 675 { .name = "mknodat", .errmsg = true, 676 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, }, 677 { .name = "mlock", .errmsg = true, 678 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 679 { .name = "mlockall", .errmsg = true, 680 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 
681 { .name = "mmap", .hexret = true, 682 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 683 [2] = SCA_MMAP_PROT, /* prot */ 684 [3] = SCA_MMAP_FLAGS, /* flags */ }, }, 685 { .name = "mprotect", .errmsg = true, 686 .arg_scnprintf = { [0] = SCA_HEX, /* start */ 687 [2] = SCA_MMAP_PROT, /* prot */ }, }, 688 { .name = "mq_unlink", .errmsg = true, 689 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, }, 690 { .name = "mremap", .hexret = true, 691 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ 692 [3] = SCA_MREMAP_FLAGS, /* flags */ 693 [4] = SCA_HEX, /* new_addr */ }, }, 694 { .name = "munlock", .errmsg = true, 695 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 696 { .name = "munmap", .errmsg = true, 697 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, }, 698 { .name = "name_to_handle_at", .errmsg = true, 699 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 700 { .name = "newfstatat", .errmsg = true, 701 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 702 { .name = "open", .errmsg = true, 703 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, }, 704 { .name = "open_by_handle_at", .errmsg = true, 705 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ 706 [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 707 { .name = "openat", .errmsg = true, 708 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ 709 [2] = SCA_OPEN_FLAGS, /* flags */ }, }, 710 { .name = "perf_event_open", .errmsg = true, 711 .arg_scnprintf = { [2] = SCA_INT, /* cpu */ 712 [3] = SCA_FD, /* group_fd */ 713 [4] = SCA_PERF_FLAGS, /* flags */ }, }, 714 { .name = "pipe2", .errmsg = true, 715 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, }, 716 { .name = "poll", .errmsg = true, .timeout = true, }, 717 { .name = "ppoll", .errmsg = true, .timeout = true, }, 718 { .name = "pread", .errmsg = true, .alias = "pread64", }, 719 { .name = "preadv", .errmsg = true, .alias = "pread", }, 720 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), }, 721 { .name = "pwrite", .errmsg = true, .alias 
= "pwrite64", }, 722 { .name = "pwritev", .errmsg = true, }, 723 { .name = "read", .errmsg = true, }, 724 { .name = "readlink", .errmsg = true, }, 725 { .name = "readlinkat", .errmsg = true, 726 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 727 { .name = "readv", .errmsg = true, }, 728 { .name = "recvfrom", .errmsg = true, 729 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 730 { .name = "recvmmsg", .errmsg = true, 731 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 732 { .name = "recvmsg", .errmsg = true, 733 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, 734 { .name = "removexattr", .errmsg = true, }, 735 { .name = "renameat", .errmsg = true, 736 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 737 { .name = "rmdir", .errmsg = true, }, 738 { .name = "rt_sigaction", .errmsg = true, 739 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, }, 740 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), }, 741 { .name = "rt_sigqueueinfo", .errmsg = true, 742 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 743 { .name = "rt_tgsigqueueinfo", .errmsg = true, 744 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, 745 { .name = "sched_getattr", .errmsg = true, }, 746 { .name = "sched_setattr", .errmsg = true, }, 747 { .name = "sched_setscheduler", .errmsg = true, 748 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, }, 749 { .name = "seccomp", .errmsg = true, 750 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */ 751 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, }, 752 { .name = "select", .errmsg = true, .timeout = true, }, 753 { .name = "sendmmsg", .errmsg = true, 754 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 755 { .name = "sendmsg", .errmsg = true, 756 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, }, 757 { .name = "sendto", .errmsg = true, 758 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, }, 759 { .name = "set_tid_address", .errpid = true, }, 760 { .name = "setitimer", .errmsg = 
true, STRARRAY(0, which, itimers), }, 761 { .name = "setpgid", .errmsg = true, }, 762 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), }, 763 { .name = "setxattr", .errmsg = true, }, 764 { .name = "shutdown", .errmsg = true, }, 765 { .name = "socket", .errmsg = true, 766 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */ 767 [1] = SCA_SK_TYPE, /* type */ }, 768 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, 769 { .name = "socketpair", .errmsg = true, 770 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */ 771 [1] = SCA_SK_TYPE, /* type */ }, 772 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, }, 773 { .name = "stat", .errmsg = true, .alias = "newstat", }, 774 { .name = "statfs", .errmsg = true, }, 775 { .name = "swapoff", .errmsg = true, 776 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, 777 { .name = "swapon", .errmsg = true, 778 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, }, 779 { .name = "symlinkat", .errmsg = true, 780 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 781 { .name = "tgkill", .errmsg = true, 782 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, }, 783 { .name = "tkill", .errmsg = true, 784 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, }, 785 { .name = "truncate", .errmsg = true, }, 786 { .name = "uname", .errmsg = true, .alias = "newuname", }, 787 { .name = "unlinkat", .errmsg = true, 788 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, }, 789 { .name = "utime", .errmsg = true, }, 790 { .name = "utimensat", .errmsg = true, 791 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, }, 792 { .name = "utimes", .errmsg = true, }, 793 { .name = "vmsplice", .errmsg = true, }, 794 { .name = "wait4", .errpid = true, 795 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, }, 796 { .name = "waitid", .errpid = true, 797 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, }, 798 { .name = "write", .errmsg = true, }, 799 { .name = "writev", 
.errmsg = true, }, 800 }; 801 802 static int syscall_fmt__cmp(const void *name, const void *fmtp) 803 { 804 const struct syscall_fmt *fmt = fmtp; 805 return strcmp(name, fmt->name); 806 } 807 808 static struct syscall_fmt *syscall_fmt__find(const char *name) 809 { 810 const int nmemb = ARRAY_SIZE(syscall_fmts); 811 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 812 } 813 814 struct syscall { 815 struct event_format *tp_format; 816 int nr_args; 817 struct format_field *args; 818 const char *name; 819 bool is_exit; 820 struct syscall_fmt *fmt; 821 size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 822 void **arg_parm; 823 }; 824 825 static size_t fprintf_duration(unsigned long t, FILE *fp) 826 { 827 double duration = (double)t / NSEC_PER_MSEC; 828 size_t printed = fprintf(fp, "("); 829 830 if (duration >= 1.0) 831 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 832 else if (duration >= 0.01) 833 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 834 else 835 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 836 return printed + fprintf(fp, "): "); 837 } 838 839 /** 840 * filename.ptr: The filename char pointer that will be vfs_getname'd 841 * filename.entry_str_pos: Where to insert the string translated from 842 * filename.ptr by the vfs_getname tracepoint/kprobe. 
843 */ 844 struct thread_trace { 845 u64 entry_time; 846 bool entry_pending; 847 unsigned long nr_events; 848 unsigned long pfmaj, pfmin; 849 char *entry_str; 850 double runtime_ms; 851 struct { 852 unsigned long ptr; 853 short int entry_str_pos; 854 bool pending_open; 855 unsigned int namelen; 856 char *name; 857 } filename; 858 struct { 859 int max; 860 char **table; 861 } paths; 862 863 struct intlist *syscall_stats; 864 }; 865 866 static struct thread_trace *thread_trace__new(void) 867 { 868 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 869 870 if (ttrace) 871 ttrace->paths.max = -1; 872 873 ttrace->syscall_stats = intlist__new(NULL); 874 875 return ttrace; 876 } 877 878 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 879 { 880 struct thread_trace *ttrace; 881 882 if (thread == NULL) 883 goto fail; 884 885 if (thread__priv(thread) == NULL) 886 thread__set_priv(thread, thread_trace__new()); 887 888 if (thread__priv(thread) == NULL) 889 goto fail; 890 891 ttrace = thread__priv(thread); 892 ++ttrace->nr_events; 893 894 return ttrace; 895 fail: 896 color_fprintf(fp, PERF_COLOR_RED, 897 "WARNING: not enough memory, dropping samples!\n"); 898 return NULL; 899 } 900 901 #define TRACE_PFMAJ (1 << 0) 902 #define TRACE_PFMIN (1 << 1) 903 904 static const size_t trace__entry_str_size = 2048; 905 906 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 907 { 908 struct thread_trace *ttrace = thread__priv(thread); 909 910 if (fd > ttrace->paths.max) { 911 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 912 913 if (npath == NULL) 914 return -1; 915 916 if (ttrace->paths.max != -1) { 917 memset(npath + ttrace->paths.max + 1, 0, 918 (fd - ttrace->paths.max) * sizeof(char *)); 919 } else { 920 memset(npath, 0, (fd + 1) * sizeof(char *)); 921 } 922 923 ttrace->paths.table = npath; 924 ttrace->paths.max = fd; 925 } 926 927 ttrace->paths.table[fd] = strdup(pathname); 928 929 
return ttrace->paths.table[fd] != NULL ? 0 : -1; 930 } 931 932 static int thread__read_fd_path(struct thread *thread, int fd) 933 { 934 char linkname[PATH_MAX], pathname[PATH_MAX]; 935 struct stat st; 936 int ret; 937 938 if (thread->pid_ == thread->tid) { 939 scnprintf(linkname, sizeof(linkname), 940 "/proc/%d/fd/%d", thread->pid_, fd); 941 } else { 942 scnprintf(linkname, sizeof(linkname), 943 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 944 } 945 946 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 947 return -1; 948 949 ret = readlink(linkname, pathname, sizeof(pathname)); 950 951 if (ret < 0 || ret > st.st_size) 952 return -1; 953 954 pathname[ret] = '\0'; 955 return trace__set_fd_pathname(thread, fd, pathname); 956 } 957 958 static const char *thread__fd_path(struct thread *thread, int fd, 959 struct trace *trace) 960 { 961 struct thread_trace *ttrace = thread__priv(thread); 962 963 if (ttrace == NULL) 964 return NULL; 965 966 if (fd < 0) 967 return NULL; 968 969 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 970 if (!trace->live) 971 return NULL; 972 ++trace->stats.proc_getname; 973 if (thread__read_fd_path(thread, fd)) 974 return NULL; 975 } 976 977 return ttrace->paths.table[fd]; 978 } 979 980 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size, 981 struct syscall_arg *arg) 982 { 983 int fd = arg->val; 984 size_t printed = scnprintf(bf, size, "%d", fd); 985 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 986 987 if (path) 988 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 989 990 return printed; 991 } 992 993 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 994 struct syscall_arg *arg) 995 { 996 int fd = arg->val; 997 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 998 struct thread_trace *ttrace = thread__priv(arg->thread); 999 1000 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 1001 zfree(&ttrace->paths.table[fd]); 
1002 1003 return printed; 1004 } 1005 1006 static void thread__set_filename_pos(struct thread *thread, const char *bf, 1007 unsigned long ptr) 1008 { 1009 struct thread_trace *ttrace = thread__priv(thread); 1010 1011 ttrace->filename.ptr = ptr; 1012 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1013 } 1014 1015 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1016 struct syscall_arg *arg) 1017 { 1018 unsigned long ptr = arg->val; 1019 1020 if (!arg->trace->vfs_getname) 1021 return scnprintf(bf, size, "%#x", ptr); 1022 1023 thread__set_filename_pos(arg->thread, bf, ptr); 1024 return 0; 1025 } 1026 1027 static bool trace__filter_duration(struct trace *trace, double t) 1028 { 1029 return t < (trace->duration_filter * NSEC_PER_MSEC); 1030 } 1031 1032 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1033 { 1034 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1035 1036 return fprintf(fp, "%10.3f ", ts); 1037 } 1038 1039 static bool done = false; 1040 static bool interrupted = false; 1041 1042 static void sig_handler(int sig) 1043 { 1044 done = true; 1045 interrupted = sig == SIGINT; 1046 } 1047 1048 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 1049 u64 duration, u64 tstamp, FILE *fp) 1050 { 1051 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 1052 printed += fprintf_duration(duration, fp); 1053 1054 if (trace->multiple_threads) { 1055 if (trace->show_comm) 1056 printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); 1057 printed += fprintf(fp, "%d ", thread->tid); 1058 } 1059 1060 return printed; 1061 } 1062 1063 static int trace__process_event(struct trace *trace, struct machine *machine, 1064 union perf_event *event, struct perf_sample *sample) 1065 { 1066 int ret = 0; 1067 1068 switch (event->header.type) { 1069 case PERF_RECORD_LOST: 1070 color_fprintf(trace->output, PERF_COLOR_RED, 1071 "LOST %" PRIu64 " events!\n", event->lost.lost); 
	ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

/* perf_tool callback: recover the owning struct trace and dispatch. */
static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

/*
 * Kernel address resolver wrapper: refuse (once, with a warning) when
 * kptr_restrict hides kernel symbols, otherwise defer to the default
 * machine resolver.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* Warn only on the first restricted lookup. */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}

/*
 * Set up symbol resolution for the host machine and synthesize existing
 * threads for the tracing target.  Returns 0 on success, a negative error
 * otherwise (symbol state is torn down if thread synthesis fails).
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/*
	 * NOTE(review): assumes the resolver helper leaves errno set on
	 * failure — confirm against trace_event__register_resolver().
	 */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}

/*
 * Pick a pretty-printer for each argument of syscall 'sc', preferring an
 * explicit per-syscall formatter table entry, then falling back on
 * heuristics keyed off the tracepoint field's type and name.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
sc->arg_parm = sc->fmt->arg_parm; 1143 1144 for (field = sc->args; field; field = field->next) { 1145 if (sc->fmt && sc->fmt->arg_scnprintf[idx]) 1146 sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx]; 1147 else if (strcmp(field->type, "const char *") == 0 && 1148 (strcmp(field->name, "filename") == 0 || 1149 strcmp(field->name, "path") == 0 || 1150 strcmp(field->name, "pathname") == 0)) 1151 sc->arg_scnprintf[idx] = SCA_FILENAME; 1152 else if (field->flags & FIELD_IS_POINTER) 1153 sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex; 1154 else if (strcmp(field->type, "pid_t") == 0) 1155 sc->arg_scnprintf[idx] = SCA_PID; 1156 else if (strcmp(field->type, "umode_t") == 0) 1157 sc->arg_scnprintf[idx] = SCA_MODE_T; 1158 else if ((strcmp(field->type, "int") == 0 || 1159 strcmp(field->type, "unsigned int") == 0 || 1160 strcmp(field->type, "long") == 0) && 1161 (len = strlen(field->name)) >= 2 && 1162 strcmp(field->name + len - 2, "fd") == 0) { 1163 /* 1164 * /sys/kernel/tracing/events/syscalls/sys_enter* 1165 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c 1166 * 65 int 1167 * 23 unsigned int 1168 * 7 unsigned long 1169 */ 1170 sc->arg_scnprintf[idx] = SCA_FD; 1171 } 1172 ++idx; 1173 } 1174 1175 return 0; 1176 } 1177 1178 static int trace__read_syscall_info(struct trace *trace, int id) 1179 { 1180 char tp_name[128]; 1181 struct syscall *sc; 1182 const char *name = syscalltbl__name(trace->sctbl, id); 1183 1184 if (name == NULL) 1185 return -1; 1186 1187 if (id > trace->syscalls.max) { 1188 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 1189 1190 if (nsyscalls == NULL) 1191 return -1; 1192 1193 if (trace->syscalls.max != -1) { 1194 memset(nsyscalls + trace->syscalls.max + 1, 0, 1195 (id - trace->syscalls.max) * sizeof(*sc)); 1196 } else { 1197 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 1198 } 1199 1200 trace->syscalls.table = nsyscalls; 1201 trace->syscalls.max = id; 1202 } 1203 1204 sc 
= trace->syscalls.table + id; 1205 sc->name = name; 1206 1207 sc->fmt = syscall_fmt__find(sc->name); 1208 1209 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 1210 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1211 1212 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) { 1213 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 1214 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1215 } 1216 1217 if (IS_ERR(sc->tp_format)) 1218 return -1; 1219 1220 sc->args = sc->tp_format->format.fields; 1221 sc->nr_args = sc->tp_format->format.nr_fields; 1222 /* 1223 * We need to check and discard the first variable '__syscall_nr' 1224 * or 'nr' that mean the syscall number. It is needless here. 1225 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. 1226 */ 1227 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { 1228 sc->args = sc->args->next; 1229 --sc->nr_args; 1230 } 1231 1232 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); 1233 1234 return syscall__set_arg_fmts(sc); 1235 } 1236 1237 static int trace__validate_ev_qualifier(struct trace *trace) 1238 { 1239 int err = 0, i; 1240 struct str_node *pos; 1241 1242 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); 1243 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * 1244 sizeof(trace->ev_qualifier_ids.entries[0])); 1245 1246 if (trace->ev_qualifier_ids.entries == NULL) { 1247 fputs("Error:\tNot enough memory for allocating events qualifier ids\n", 1248 trace->output); 1249 err = -EINVAL; 1250 goto out; 1251 } 1252 1253 i = 0; 1254 1255 strlist__for_each_entry(pos, trace->ev_qualifier) { 1256 const char *sc = pos->s; 1257 int id = syscalltbl__id(trace->sctbl, sc); 1258 1259 if (id < 0) { 1260 if (err == 0) { 1261 fputs("Error:\tInvalid syscall ", trace->output); 1262 err = -EINVAL; 1263 } else { 1264 fputs(", ", trace->output); 1265 } 1266 1267 
fputs(sc, trace->output); 1268 } 1269 1270 trace->ev_qualifier_ids.entries[i++] = id; 1271 } 1272 1273 if (err < 0) { 1274 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1275 "\nHint:\tand: 'man syscalls'\n", trace->output); 1276 zfree(&trace->ev_qualifier_ids.entries); 1277 trace->ev_qualifier_ids.nr = 0; 1278 } 1279 out: 1280 return err; 1281 } 1282 1283 /* 1284 * args is to be interpreted as a series of longs but we need to handle 1285 * 8-byte unaligned accesses. args points to raw_data within the event 1286 * and raw_data is guaranteed to be 8-byte unaligned because it is 1287 * preceded by raw_size which is a u32. So we need to copy args to a temp 1288 * variable to read it. Most notably this avoids extended load instructions 1289 * on unaligned addresses 1290 */ 1291 1292 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 1293 unsigned char *args, struct trace *trace, 1294 struct thread *thread) 1295 { 1296 size_t printed = 0; 1297 unsigned char *p; 1298 unsigned long val; 1299 1300 if (sc->args != NULL) { 1301 struct format_field *field; 1302 u8 bit = 1; 1303 struct syscall_arg arg = { 1304 .idx = 0, 1305 .mask = 0, 1306 .trace = trace, 1307 .thread = thread, 1308 }; 1309 1310 for (field = sc->args; field; 1311 field = field->next, ++arg.idx, bit <<= 1) { 1312 if (arg.mask & bit) 1313 continue; 1314 1315 /* special care for unaligned accesses */ 1316 p = args + sizeof(unsigned long) * arg.idx; 1317 memcpy(&val, p, sizeof(val)); 1318 1319 /* 1320 * Suppress this argument if its value is zero and 1321 * and we don't have a string associated in an 1322 * strarray for it. 1323 */ 1324 if (val == 0 && 1325 !(sc->arg_scnprintf && 1326 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY && 1327 sc->arg_parm[arg.idx])) 1328 continue; 1329 1330 printed += scnprintf(bf + printed, size - printed, 1331 "%s%s: ", printed ? 
", " : "", field->name); 1332 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) { 1333 arg.val = val; 1334 if (sc->arg_parm) 1335 arg.parm = sc->arg_parm[arg.idx]; 1336 printed += sc->arg_scnprintf[arg.idx](bf + printed, 1337 size - printed, &arg); 1338 } else { 1339 printed += scnprintf(bf + printed, size - printed, 1340 "%ld", val); 1341 } 1342 } 1343 } else if (IS_ERR(sc->tp_format)) { 1344 /* 1345 * If we managed to read the tracepoint /format file, then we 1346 * may end up not having any args, like with gettid(), so only 1347 * print the raw args when we didn't manage to read it. 1348 */ 1349 int i = 0; 1350 1351 while (i < 6) { 1352 /* special care for unaligned accesses */ 1353 p = args + sizeof(unsigned long) * i; 1354 memcpy(&val, p, sizeof(val)); 1355 printed += scnprintf(bf + printed, size - printed, 1356 "%sarg%d: %ld", 1357 printed ? ", " : "", i, val); 1358 ++i; 1359 } 1360 } 1361 1362 return printed; 1363 } 1364 1365 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1366 union perf_event *event, 1367 struct perf_sample *sample); 1368 1369 static struct syscall *trace__syscall_info(struct trace *trace, 1370 struct perf_evsel *evsel, int id) 1371 { 1372 1373 if (id < 0) { 1374 1375 /* 1376 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1377 * before that, leaving at a higher verbosity level till that is 1378 * explained. Reproduced with plain ftrace with: 1379 * 1380 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1381 * grep "NR -1 " /t/trace_pipe 1382 * 1383 * After generating some load on the machine. 
1384 */ 1385 if (verbose > 1) { 1386 static u64 n; 1387 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1388 id, perf_evsel__name(evsel), ++n); 1389 } 1390 return NULL; 1391 } 1392 1393 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1394 trace__read_syscall_info(trace, id)) 1395 goto out_cant_read; 1396 1397 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1398 goto out_cant_read; 1399 1400 return &trace->syscalls.table[id]; 1401 1402 out_cant_read: 1403 if (verbose) { 1404 fprintf(trace->output, "Problems reading syscall %d", id); 1405 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1406 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1407 fputs(" information\n", trace->output); 1408 } 1409 return NULL; 1410 } 1411 1412 static void thread__update_stats(struct thread_trace *ttrace, 1413 int id, struct perf_sample *sample) 1414 { 1415 struct int_node *inode; 1416 struct stats *stats; 1417 u64 duration = 0; 1418 1419 inode = intlist__findnew(ttrace->syscall_stats, id); 1420 if (inode == NULL) 1421 return; 1422 1423 stats = inode->priv; 1424 if (stats == NULL) { 1425 stats = malloc(sizeof(struct stats)); 1426 if (stats == NULL) 1427 return; 1428 init_stats(stats); 1429 inode->priv = stats; 1430 } 1431 1432 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1433 duration = sample->time - ttrace->entry_time; 1434 1435 update_stats(stats, duration); 1436 } 1437 1438 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample) 1439 { 1440 struct thread_trace *ttrace; 1441 u64 duration; 1442 size_t printed; 1443 1444 if (trace->current == NULL) 1445 return 0; 1446 1447 ttrace = thread__priv(trace->current); 1448 1449 if (!ttrace->entry_pending) 1450 return 0; 1451 1452 duration = sample->time - ttrace->entry_time; 1453 1454 printed = trace__fprintf_entry_head(trace, trace->current, duration, 
ttrace->entry_time, trace->output); 1455 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); 1456 ttrace->entry_pending = false; 1457 1458 return printed; 1459 } 1460 1461 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 1462 union perf_event *event __maybe_unused, 1463 struct perf_sample *sample) 1464 { 1465 char *msg; 1466 void *args; 1467 size_t printed = 0; 1468 struct thread *thread; 1469 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1470 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1471 struct thread_trace *ttrace; 1472 1473 if (sc == NULL) 1474 return -1; 1475 1476 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1477 ttrace = thread__trace(thread, trace->output); 1478 if (ttrace == NULL) 1479 goto out_put; 1480 1481 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1482 1483 if (ttrace->entry_str == NULL) { 1484 ttrace->entry_str = malloc(trace__entry_str_size); 1485 if (!ttrace->entry_str) 1486 goto out_put; 1487 } 1488 1489 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) 1490 trace__printf_interrupted_entry(trace, sample); 1491 1492 ttrace->entry_time = sample->time; 1493 msg = ttrace->entry_str; 1494 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1495 1496 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1497 args, trace, thread); 1498 1499 if (sc->is_exit) { 1500 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) { 1501 trace__fprintf_entry_head(trace, thread, 1, ttrace->entry_time, trace->output); 1502 fprintf(trace->output, "%-70s)\n", ttrace->entry_str); 1503 } 1504 } else { 1505 ttrace->entry_pending = true; 1506 /* See trace__vfs_getname & trace__sys_exit */ 1507 ttrace->filename.pending_open = false; 1508 } 1509 1510 if (trace->current != thread) { 1511 thread__put(trace->current); 1512 trace->current = thread__get(thread); 
1513 } 1514 err = 0; 1515 out_put: 1516 thread__put(thread); 1517 return err; 1518 } 1519 1520 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1521 struct perf_sample *sample, 1522 struct callchain_cursor *cursor) 1523 { 1524 struct addr_location al; 1525 1526 if (machine__resolve(trace->host, &al, sample) < 0 || 1527 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack)) 1528 return -1; 1529 1530 return 0; 1531 } 1532 1533 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 1534 { 1535 /* TODO: user-configurable print_opts */ 1536 const unsigned int print_opts = EVSEL__PRINT_SYM | 1537 EVSEL__PRINT_DSO | 1538 EVSEL__PRINT_UNKNOWN_AS_ADDR; 1539 1540 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 1541 } 1542 1543 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 1544 union perf_event *event __maybe_unused, 1545 struct perf_sample *sample) 1546 { 1547 long ret; 1548 u64 duration = 0; 1549 struct thread *thread; 1550 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; 1551 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1552 struct thread_trace *ttrace; 1553 1554 if (sc == NULL) 1555 return -1; 1556 1557 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1558 ttrace = thread__trace(thread, trace->output); 1559 if (ttrace == NULL) 1560 goto out_put; 1561 1562 if (trace->summary) 1563 thread__update_stats(ttrace, id, sample); 1564 1565 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 1566 1567 if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) { 1568 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 1569 ttrace->filename.pending_open = false; 1570 ++trace->stats.vfs_getname; 1571 } 1572 1573 if (ttrace->entry_time) { 1574 duration = sample->time - ttrace->entry_time; 1575 if (trace__filter_duration(trace, 
duration)) 1576 goto out; 1577 } else if (trace->duration_filter) 1578 goto out; 1579 1580 if (sample->callchain) { 1581 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1582 if (callchain_ret == 0) { 1583 if (callchain_cursor.nr < trace->min_stack) 1584 goto out; 1585 callchain_ret = 1; 1586 } 1587 } 1588 1589 if (trace->summary_only) 1590 goto out; 1591 1592 trace__fprintf_entry_head(trace, thread, duration, ttrace->entry_time, trace->output); 1593 1594 if (ttrace->entry_pending) { 1595 fprintf(trace->output, "%-70s", ttrace->entry_str); 1596 } else { 1597 fprintf(trace->output, " ... ["); 1598 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 1599 fprintf(trace->output, "]: %s()", sc->name); 1600 } 1601 1602 if (sc->fmt == NULL) { 1603 signed_print: 1604 fprintf(trace->output, ") = %ld", ret); 1605 } else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) { 1606 char bf[STRERR_BUFSIZE]; 1607 const char *emsg = str_error_r(-ret, bf, sizeof(bf)), 1608 *e = audit_errno_to_name(-ret); 1609 1610 fprintf(trace->output, ") = -1 %s %s", e, emsg); 1611 } else if (ret == 0 && sc->fmt->timeout) 1612 fprintf(trace->output, ") = 0 Timeout"); 1613 else if (sc->fmt->hexret) 1614 fprintf(trace->output, ") = %#lx", ret); 1615 else if (sc->fmt->errpid) { 1616 struct thread *child = machine__find_thread(trace->host, ret, ret); 1617 1618 if (child != NULL) { 1619 fprintf(trace->output, ") = %ld", ret); 1620 if (child->comm_set) 1621 fprintf(trace->output, " (%s)", thread__comm_str(child)); 1622 thread__put(child); 1623 } 1624 } else 1625 goto signed_print; 1626 1627 fputc('\n', trace->output); 1628 1629 if (callchain_ret > 0) 1630 trace__fprintf_callchain(trace, sample); 1631 else if (callchain_ret < 0) 1632 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1633 out: 1634 ttrace->entry_pending = false; 1635 err = 0; 1636 out_put: 1637 thread__put(thread); 1638 return err; 1639 } 1640 1641 static int 
trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, 1642 union perf_event *event __maybe_unused, 1643 struct perf_sample *sample) 1644 { 1645 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1646 struct thread_trace *ttrace; 1647 size_t filename_len, entry_str_len, to_move; 1648 ssize_t remaining_space; 1649 char *pos; 1650 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); 1651 1652 if (!thread) 1653 goto out; 1654 1655 ttrace = thread__priv(thread); 1656 if (!ttrace) 1657 goto out; 1658 1659 filename_len = strlen(filename); 1660 1661 if (ttrace->filename.namelen < filename_len) { 1662 char *f = realloc(ttrace->filename.name, filename_len + 1); 1663 1664 if (f == NULL) 1665 goto out; 1666 1667 ttrace->filename.namelen = filename_len; 1668 ttrace->filename.name = f; 1669 } 1670 1671 strcpy(ttrace->filename.name, filename); 1672 ttrace->filename.pending_open = true; 1673 1674 if (!ttrace->filename.ptr) 1675 goto out; 1676 1677 entry_str_len = strlen(ttrace->entry_str); 1678 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ 1679 if (remaining_space <= 0) 1680 goto out; 1681 1682 if (filename_len > (size_t)remaining_space) { 1683 filename += filename_len - remaining_space; 1684 filename_len = remaining_space; 1685 } 1686 1687 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */ 1688 pos = ttrace->entry_str + ttrace->filename.entry_str_pos; 1689 memmove(pos + filename_len, pos, to_move); 1690 memcpy(pos, filename, filename_len); 1691 1692 ttrace->filename.ptr = 0; 1693 ttrace->filename.entry_str_pos = 0; 1694 out: 1695 return 0; 1696 } 1697 1698 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel, 1699 union perf_event *event __maybe_unused, 1700 struct perf_sample *sample) 1701 { 1702 u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 1703 double runtime_ms = (double)runtime / NSEC_PER_MSEC; 1704 struct thread *thread = 
machine__findnew_thread(trace->host, 1705 sample->pid, 1706 sample->tid); 1707 struct thread_trace *ttrace = thread__trace(thread, trace->output); 1708 1709 if (ttrace == NULL) 1710 goto out_dump; 1711 1712 ttrace->runtime_ms += runtime_ms; 1713 trace->runtime_ms += runtime_ms; 1714 thread__put(thread); 1715 return 0; 1716 1717 out_dump: 1718 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", 1719 evsel->name, 1720 perf_evsel__strval(evsel, sample, "comm"), 1721 (pid_t)perf_evsel__intval(evsel, sample, "pid"), 1722 runtime, 1723 perf_evsel__intval(evsel, sample, "vruntime")); 1724 thread__put(thread); 1725 return 0; 1726 } 1727 1728 static void bpf_output__printer(enum binary_printer_ops op, 1729 unsigned int val, void *extra) 1730 { 1731 FILE *output = extra; 1732 unsigned char ch = (unsigned char)val; 1733 1734 switch (op) { 1735 case BINARY_PRINT_CHAR_DATA: 1736 fprintf(output, "%c", isprint(ch) ? ch : '.'); 1737 break; 1738 case BINARY_PRINT_DATA_BEGIN: 1739 case BINARY_PRINT_LINE_BEGIN: 1740 case BINARY_PRINT_ADDR: 1741 case BINARY_PRINT_NUM_DATA: 1742 case BINARY_PRINT_NUM_PAD: 1743 case BINARY_PRINT_SEP: 1744 case BINARY_PRINT_CHAR_PAD: 1745 case BINARY_PRINT_LINE_END: 1746 case BINARY_PRINT_DATA_END: 1747 default: 1748 break; 1749 } 1750 } 1751 1752 static void bpf_output__fprintf(struct trace *trace, 1753 struct perf_sample *sample) 1754 { 1755 print_binary(sample->raw_data, sample->raw_size, 8, 1756 bpf_output__printer, trace->output); 1757 } 1758 1759 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, 1760 union perf_event *event __maybe_unused, 1761 struct perf_sample *sample) 1762 { 1763 int callchain_ret = 0; 1764 1765 if (sample->callchain) { 1766 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1767 if (callchain_ret == 0) { 1768 if (callchain_cursor.nr < trace->min_stack) 1769 goto out; 1770 callchain_ret = 1; 1771 } 1772 } 1773 1774 
trace__printf_interrupted_entry(trace, sample); 1775 trace__fprintf_tstamp(trace, sample->time, trace->output); 1776 1777 if (trace->trace_syscalls) 1778 fprintf(trace->output, "( ): "); 1779 1780 fprintf(trace->output, "%s:", evsel->name); 1781 1782 if (perf_evsel__is_bpf_output(evsel)) { 1783 bpf_output__fprintf(trace, sample); 1784 } else if (evsel->tp_format) { 1785 event_format__fprintf(evsel->tp_format, sample->cpu, 1786 sample->raw_data, sample->raw_size, 1787 trace->output); 1788 } 1789 1790 fprintf(trace->output, ")\n"); 1791 1792 if (callchain_ret > 0) 1793 trace__fprintf_callchain(trace, sample); 1794 else if (callchain_ret < 0) 1795 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1796 out: 1797 return 0; 1798 } 1799 1800 static void print_location(FILE *f, struct perf_sample *sample, 1801 struct addr_location *al, 1802 bool print_dso, bool print_sym) 1803 { 1804 1805 if ((verbose || print_dso) && al->map) 1806 fprintf(f, "%s@", al->map->dso->long_name); 1807 1808 if ((verbose || print_sym) && al->sym) 1809 fprintf(f, "%s+0x%" PRIx64, al->sym->name, 1810 al->addr - al->sym->start); 1811 else if (al->map) 1812 fprintf(f, "0x%" PRIx64, al->addr); 1813 else 1814 fprintf(f, "0x%" PRIx64, sample->addr); 1815 } 1816 1817 static int trace__pgfault(struct trace *trace, 1818 struct perf_evsel *evsel, 1819 union perf_event *event __maybe_unused, 1820 struct perf_sample *sample) 1821 { 1822 struct thread *thread; 1823 struct addr_location al; 1824 char map_type = 'd'; 1825 struct thread_trace *ttrace; 1826 int err = -1; 1827 int callchain_ret = 0; 1828 1829 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1830 1831 if (sample->callchain) { 1832 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1833 if (callchain_ret == 0) { 1834 if (callchain_cursor.nr < trace->min_stack) 1835 goto out_put; 1836 callchain_ret = 1; 1837 } 1838 } 1839 1840 ttrace = thread__trace(thread, 
trace->output); 1841 if (ttrace == NULL) 1842 goto out_put; 1843 1844 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) 1845 ttrace->pfmaj++; 1846 else 1847 ttrace->pfmin++; 1848 1849 if (trace->summary_only) 1850 goto out; 1851 1852 thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION, 1853 sample->ip, &al); 1854 1855 trace__fprintf_entry_head(trace, thread, 0, sample->time, trace->output); 1856 1857 fprintf(trace->output, "%sfault [", 1858 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ? 1859 "maj" : "min"); 1860 1861 print_location(trace->output, sample, &al, false, true); 1862 1863 fprintf(trace->output, "] => "); 1864 1865 thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE, 1866 sample->addr, &al); 1867 1868 if (!al.map) { 1869 thread__find_addr_location(thread, sample->cpumode, 1870 MAP__FUNCTION, sample->addr, &al); 1871 1872 if (al.map) 1873 map_type = 'x'; 1874 else 1875 map_type = '?'; 1876 } 1877 1878 print_location(trace->output, sample, &al, true, false); 1879 1880 fprintf(trace->output, " (%c%c)\n", map_type, al.level); 1881 1882 if (callchain_ret > 0) 1883 trace__fprintf_callchain(trace, sample); 1884 else if (callchain_ret < 0) 1885 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1886 out: 1887 err = 0; 1888 out_put: 1889 thread__put(thread); 1890 return err; 1891 } 1892 1893 static bool skip_sample(struct trace *trace, struct perf_sample *sample) 1894 { 1895 if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) || 1896 (trace->tid_list && intlist__find(trace->tid_list, sample->tid))) 1897 return false; 1898 1899 if (trace->pid_list || trace->tid_list) 1900 return true; 1901 1902 return false; 1903 } 1904 1905 static void trace__set_base_time(struct trace *trace, 1906 struct perf_evsel *evsel, 1907 struct perf_sample *sample) 1908 { 1909 /* 1910 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust 1911 * and don't use sample->time unconditionally, 
we may end up having 1912 * some other event in the future without PERF_SAMPLE_TIME for good 1913 * reason, i.e. we may not be interested in its timestamps, just in 1914 * it taking place, picking some piece of information when it 1915 * appears in our event stream (vfs_getname comes to mind). 1916 */ 1917 if (trace->base_time == 0 && !trace->full_time && 1918 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 1919 trace->base_time = sample->time; 1920 } 1921 1922 static int trace__process_sample(struct perf_tool *tool, 1923 union perf_event *event, 1924 struct perf_sample *sample, 1925 struct perf_evsel *evsel, 1926 struct machine *machine __maybe_unused) 1927 { 1928 struct trace *trace = container_of(tool, struct trace, tool); 1929 int err = 0; 1930 1931 tracepoint_handler handler = evsel->handler; 1932 1933 if (skip_sample(trace, sample)) 1934 return 0; 1935 1936 trace__set_base_time(trace, evsel, sample); 1937 1938 if (handler) { 1939 ++trace->nr_events; 1940 handler(trace, evsel, event, sample); 1941 } 1942 1943 return err; 1944 } 1945 1946 static int parse_target_str(struct trace *trace) 1947 { 1948 if (trace->opts.target.pid) { 1949 trace->pid_list = intlist__new(trace->opts.target.pid); 1950 if (trace->pid_list == NULL) { 1951 pr_err("Error parsing process id string\n"); 1952 return -EINVAL; 1953 } 1954 } 1955 1956 if (trace->opts.target.tid) { 1957 trace->tid_list = intlist__new(trace->opts.target.tid); 1958 if (trace->tid_list == NULL) { 1959 pr_err("Error parsing thread id string\n"); 1960 return -EINVAL; 1961 } 1962 } 1963 1964 return 0; 1965 } 1966 1967 static int trace__record(struct trace *trace, int argc, const char **argv) 1968 { 1969 unsigned int rec_argc, i, j; 1970 const char **rec_argv; 1971 const char * const record_args[] = { 1972 "record", 1973 "-R", 1974 "-m", "1024", 1975 "-c", "1", 1976 }; 1977 1978 const char * const sc_args[] = { "-e", }; 1979 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 1980 const char * const majpf_args[] = { "-e", 
"major-faults" }; 1981 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 1982 const char * const minpf_args[] = { "-e", "minor-faults" }; 1983 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 1984 1985 /* +1 is for the event string below */ 1986 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 1987 majpf_args_nr + minpf_args_nr + argc; 1988 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 1989 1990 if (rec_argv == NULL) 1991 return -ENOMEM; 1992 1993 j = 0; 1994 for (i = 0; i < ARRAY_SIZE(record_args); i++) 1995 rec_argv[j++] = record_args[i]; 1996 1997 if (trace->trace_syscalls) { 1998 for (i = 0; i < sc_args_nr; i++) 1999 rec_argv[j++] = sc_args[i]; 2000 2001 /* event string may be different for older kernels - e.g., RHEL6 */ 2002 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2003 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2004 else if (is_valid_tracepoint("syscalls:sys_enter")) 2005 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2006 else { 2007 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2008 return -1; 2009 } 2010 } 2011 2012 if (trace->trace_pgfaults & TRACE_PFMAJ) 2013 for (i = 0; i < majpf_args_nr; i++) 2014 rec_argv[j++] = majpf_args[i]; 2015 2016 if (trace->trace_pgfaults & TRACE_PFMIN) 2017 for (i = 0; i < minpf_args_nr; i++) 2018 rec_argv[j++] = minpf_args[i]; 2019 2020 for (i = 0; i < (unsigned int)argc; i++) 2021 rec_argv[j++] = argv[i]; 2022 2023 return cmd_record(j, rec_argv, NULL); 2024 } 2025 2026 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2027 2028 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2029 { 2030 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); 2031 2032 if (IS_ERR(evsel)) 2033 return false; 2034 2035 if (perf_evsel__field(evsel, "pathname") == NULL) { 2036 perf_evsel__delete(evsel); 2037 return false; 2038 } 2039 2040 evsel->handler = trace__vfs_getname; 2041 perf_evlist__add(evlist, evsel); 2042 
return true; 2043 } 2044 2045 static struct perf_evsel *perf_evsel__new_pgfault(u64 config) 2046 { 2047 struct perf_evsel *evsel; 2048 struct perf_event_attr attr = { 2049 .type = PERF_TYPE_SOFTWARE, 2050 .mmap_data = 1, 2051 }; 2052 2053 attr.config = config; 2054 attr.sample_period = 1; 2055 2056 event_attr_init(&attr); 2057 2058 evsel = perf_evsel__new(&attr); 2059 if (evsel) 2060 evsel->handler = trace__pgfault; 2061 2062 return evsel; 2063 } 2064 2065 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) 2066 { 2067 const u32 type = event->header.type; 2068 struct perf_evsel *evsel; 2069 2070 if (type != PERF_RECORD_SAMPLE) { 2071 trace__process_event(trace, trace->host, event, sample); 2072 return; 2073 } 2074 2075 evsel = perf_evlist__id2evsel(trace->evlist, sample->id); 2076 if (evsel == NULL) { 2077 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id); 2078 return; 2079 } 2080 2081 trace__set_base_time(trace, evsel, sample); 2082 2083 if (evsel->attr.type == PERF_TYPE_TRACEPOINT && 2084 sample->raw_data == NULL) { 2085 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", 2086 perf_evsel__name(evsel), sample->tid, 2087 sample->cpu, sample->raw_size); 2088 } else { 2089 tracepoint_handler handler = evsel->handler; 2090 handler(trace, evsel, event, sample); 2091 } 2092 } 2093 2094 static int trace__add_syscall_newtp(struct trace *trace) 2095 { 2096 int ret = -1; 2097 struct perf_evlist *evlist = trace->evlist; 2098 struct perf_evsel *sys_enter, *sys_exit; 2099 2100 sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter); 2101 if (sys_enter == NULL) 2102 goto out; 2103 2104 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) 2105 goto out_delete_sys_enter; 2106 2107 sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit); 2108 if (sys_exit == NULL) 2109 goto out_delete_sys_enter; 2110 2111 if 
	    (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

/*
 * Build an "id in (...)" (or "id not in (...)" when the qualifier was
 * negated) tracepoint filter from the resolved syscall ids and append it to
 * both the sys_enter and sys_exit evsels. Returns 0 on success; on OOM sets
 * errno = ENOMEM and returns -1.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* only filter sys_exit if appending to sys_enter succeeded (returns 0) */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}

/*
 * Live-tracing main entry: set up the evsels (syscalls, vfs_getname,
 * pagefaults, sched), open/mmap the events, optionally fork the workload,
 * then loop reading the ring buffers until interrupted or the workload
 * exits. When argc > 0 the remaining argv is the workload to fork.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname =
			perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	/* with --delay, enabling is deferred until after the workload starts */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* drain every per-cpu ring buffer, dispatching each record */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* workload finished: disable and drain what's left */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/*
	 * Nothing new arrived on this pass: poll (bounded by 100ms once the
	 * workload is done) and loop again while any fd remains readable.
	 */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
					       trace->stats.vfs_getname,
					       trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Out-of-line error labels, placed after the normal return so the common
 * path stays straight-line; the braced block exists only to scope errbuf
 * for the labels that format into it.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}

/*
 * Replay mode (-i): process a previously recorded perf.data file instead of
 * tracing live, routing syscall enter/exit and pagefault samples through the
 * same handlers used by live tracing.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* route any recorded software pagefault events to the pgfault handler */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

/* Print the banner that precedes the per-thread summary. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed  = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

/*
 * Resorted view of a thread's per-syscall stats intlist, ordered by total
 * time (msecs, descending). The body below fills one resort entry from an
 * intlist node: source->i is the syscall id, source->priv the stats.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	entry->msecs   = stats ?
			 /* total time = count * mean, converted ns -> ms */
			 (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

/*
 * Print one thread's per-syscall statistics table (calls, total/min/avg/max
 * in msec, stddev as a percentage of the mean), sorted by total time.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			pct = avg ?
			      /* stddev as a percentage of the mean */
			      100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/*
 * Print the summary line for one thread (comm, tid, event count, share of
 * all events, fault counts, optional runtime) followed by its per-syscall
 * stats table. Returns the number of characters printed, 0 when the thread
 * has no thread_trace attached.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}

/* Event count for a thread; tolerates threads with no thread_trace. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ?
	       ttrace->nr_events : 0;
}

/*
 * Resorted view of the machine's threads, ordered by event count
 * (busiest last, given the '<' comparison).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}

/*
 * Print the end-of-run summary: one section per traced thread, sorted by
 * number of events. Returns the number of characters printed.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}

/* --duration option callback: parse the minimum duration (ms) to show. */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

/*
 * --filter-pids option callback: parse a CSV list of pids into
 * trace->filter_pids, with perf's own pid prepended at slot 0.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
2668 */ 2669 struct intlist *list = intlist__new(str); 2670 2671 if (list == NULL) 2672 return -1; 2673 2674 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1; 2675 trace->filter_pids.entries = calloc(i, sizeof(pid_t)); 2676 2677 if (trace->filter_pids.entries == NULL) 2678 goto out; 2679 2680 trace->filter_pids.entries[0] = getpid(); 2681 2682 for (i = 1; i < trace->filter_pids.nr; ++i) 2683 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i; 2684 2685 intlist__delete(list); 2686 ret = 0; 2687 out: 2688 return ret; 2689 } 2690 2691 static int trace__open_output(struct trace *trace, const char *filename) 2692 { 2693 struct stat st; 2694 2695 if (!stat(filename, &st) && st.st_size) { 2696 char oldname[PATH_MAX]; 2697 2698 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 2699 unlink(oldname); 2700 rename(filename, oldname); 2701 } 2702 2703 trace->output = fopen(filename, "w"); 2704 2705 return trace->output == NULL ? -errno : 0; 2706 } 2707 2708 static int parse_pagefaults(const struct option *opt, const char *str, 2709 int unset __maybe_unused) 2710 { 2711 int *trace_pgfaults = opt->value; 2712 2713 if (strcmp(str, "all") == 0) 2714 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN; 2715 else if (strcmp(str, "maj") == 0) 2716 *trace_pgfaults |= TRACE_PFMAJ; 2717 else if (strcmp(str, "min") == 0) 2718 *trace_pgfaults |= TRACE_PFMIN; 2719 else 2720 return -1; 2721 2722 return 0; 2723 } 2724 2725 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) 2726 { 2727 struct perf_evsel *evsel; 2728 2729 evlist__for_each_entry(evlist, evsel) 2730 evsel->handler = handler; 2731 } 2732 2733 int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused) 2734 { 2735 const char *trace_usage[] = { 2736 "perf trace [<options>] [<command>]", 2737 "perf trace [<options>] -- <command> [<options>]", 2738 "perf trace record [<options>] [<command>]", 2739 "perf trace record [<options>] -- <command> [<options>]", 2740 NULL 
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* pagefault tracing needs the fault address and a timestamp */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	/* UINT_MAX means the user didn't pass --max-stack; pick a default */
	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* --min/--max-stack without --call-graph implies dwarf callchains */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	/* a leading '!' on the -e expression negates the qualifier */
	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target: default to system-wide tracing */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}