/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc. Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 *
 * Released under the GPL v2. (and only v2, not any later version)
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include "builtin.h"
#include "util/color.h"
#include "util/debug.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/audit.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>

#include "sane_ctype.h"

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

/*
 * Global state for one 'perf trace' run: the evlist/machine being traced,
 * per-run options, the syscall table and per-run statistics/flags.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;
		struct {	/* the raw_syscalls (or syscalls) tracepoints */
			struct perf_evsel *sys_enter,
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* last thread that hit an event */
	u64			base_time;	/* first sample tstamp, for relative output */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name filter */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to filter out (e.g. ourselves) */
	}			filter_pids;
	double			duration_filter; /* in ms, --duration */
	double			runtime_ms;
	struct {	/* tool self-statistics, shown with --stat */
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier; /* ev_qualifier is a deny-list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	int			open_id;	/* syscall id of "open", for vfs_getname */
};

/*
 * Accessor for one tracepoint payload field: offset into raw_data plus a
 * size/endianness-specific reader (integer) or a raw pointer getter.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/* Generate a same-endian integer field reader for the given width. */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/* Generate a byte-swapping reader, for cross-endian perf.data files. */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

/*
 * Bind an integer tp_field to the format_field's offset, picking the reader
 * matching the field size (and swapping when needs_swap).  Returns -1 on
 * unsupported field sizes.
 */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	switch (format_field->size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind a pointer tp_field: just records the offset, read via tp_field__ptr(). */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}

/*
 * Per-evsel (sys_enter/sys_exit) field accessors: the syscall id plus either
 * the args array (enter) or the return value (exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

/* Init the syscall_tp member named 'name' from the tracepoint field of the same name. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

/*
 * Allocate the syscall_tp priv area and bind its 'id' field; on failure the
 * priv area is freed and -ENOENT returned (out-of-memory gives -ENOMEM).
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

/*
 * Create a sys_enter or sys_exit tracepoint evsel, falling back from the
 * "raw_syscalls" to the "syscalls" subsystem for older kernels.  Returns
 * NULL on failure.
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

/* Read the syscall_tp member 'name' of this evsel for the given sample. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

/*
 * Format 'val' as the matching string in 'sa', or fall back to 'intfmt'
 * when it is out of the array's [offset, offset + nr_entries) range.
 */
size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		return scnprintf(bf, size, intfmt, val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}

static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

static size_t
syscall_arg__scnprintf_strarray(char *bf, size_t size,
				struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

/* A set of strarrays with disjoint offset ranges, searched in order. */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/*
 * Like strarray__scnprintf() but tries each strarray in turn; falls back to
 * "%d" when no array covers arg->val (or the matching slot is NULL).
 */
size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	struct strarrays *sas = arg->parm;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = arg->val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s", sa->entries[idx]);
		}
	}

	return scnprintf(bf, size, "%d", arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

/* Print *at() dirfd arguments: the AT_FDCWD sentinel becomes "CWD". */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_* start at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* F_LINUX_SPECIFIC_BASE-relative commands; [5] skips the F_CANCELLK gap. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);

/* access(2) mode: "F" alone, or any of R/W/X ORed together. */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ?
"|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags

/* Shorthand for a syscall_arg_fmt entry printed via a single strarray. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }

#include "trace/beauty/eventfd.c"
#include "trace/beauty/flock.c"
#include "trace/beauty/futex_op.c"
#include "trace/beauty/mmap.c"
#include "trace/beauty/mode_t.c"
#include "trace/beauty/msg_flags.c"
#include "trace/beauty/open_flags.c"
#include "trace/beauty/perf_event_open.c"
#include "trace/beauty/pid.c"
#include "trace/beauty/sched_policy.c"
#include "trace/beauty/seccomp.c"
#include "trace/beauty/signum.c"
#include "trace/beauty/socket_type.c"
#include "trace/beauty/waitid_options.c"

/* How one syscall argument is pretty-printed. */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;	/* formatter-specific data, e.g. a strarray */
	const char *name;
	bool	   show_zero;	/* print the arg even when it is zero */
};

/*
 * Per-syscall formatting overrides, sorted by name for bsearch() in
 * syscall_fmt__find().  'alias' maps arch-specific names, 'errpid' marks
 * pid-returning syscalls, 'timeout' selects duration coloring, 'hexret'
 * prints the return value in hex.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,	/* mode */ }, }, },
	{ .name	    = "arch_prctl", .alias = "prctl", },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};

static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}

/* Look up a formatting override by syscall name; NULL when there is none. */
static struct syscall_fmt *syscall_fmt__find(const char *name)
{
	const int nmemb = ARRAY_SIZE(syscall_fmts);
	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
}

/* Everything we know about one syscall: tracepoint format + print overrides. */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: no sys_exit event follows */
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? ) for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, " ?
"); 814 else if (duration >= 1.0) 815 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 816 else if (duration >= 0.01) 817 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 818 else 819 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 820 return printed + fprintf(fp, "): "); 821 } 822 823 /** 824 * filename.ptr: The filename char pointer that will be vfs_getname'd 825 * filename.entry_str_pos: Where to insert the string translated from 826 * filename.ptr by the vfs_getname tracepoint/kprobe. 827 * ret_scnprintf: syscall args may set this to a different syscall return 828 * formatter, for instance, fcntl may return fds, file flags, etc. 829 */ 830 struct thread_trace { 831 u64 entry_time; 832 bool entry_pending; 833 unsigned long nr_events; 834 unsigned long pfmaj, pfmin; 835 char *entry_str; 836 double runtime_ms; 837 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 838 struct { 839 unsigned long ptr; 840 short int entry_str_pos; 841 bool pending_open; 842 unsigned int namelen; 843 char *name; 844 } filename; 845 struct { 846 int max; 847 char **table; 848 } paths; 849 850 struct intlist *syscall_stats; 851 }; 852 853 static struct thread_trace *thread_trace__new(void) 854 { 855 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 856 857 if (ttrace) 858 ttrace->paths.max = -1; 859 860 ttrace->syscall_stats = intlist__new(NULL); 861 862 return ttrace; 863 } 864 865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 866 { 867 struct thread_trace *ttrace; 868 869 if (thread == NULL) 870 goto fail; 871 872 if (thread__priv(thread) == NULL) 873 thread__set_priv(thread, thread_trace__new()); 874 875 if (thread__priv(thread) == NULL) 876 goto fail; 877 878 ttrace = thread__priv(thread); 879 ++ttrace->nr_events; 880 881 return ttrace; 882 fail: 883 color_fprintf(fp, PERF_COLOR_RED, 884 "WARNING: not enough memory, dropping samples!\n"); 885 return 
NULL; 886 } 887 888 889 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, 890 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg)) 891 { 892 struct thread_trace *ttrace = thread__priv(arg->thread); 893 894 ttrace->ret_scnprintf = ret_scnprintf; 895 } 896 897 #define TRACE_PFMAJ (1 << 0) 898 #define TRACE_PFMIN (1 << 1) 899 900 static const size_t trace__entry_str_size = 2048; 901 902 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 903 { 904 struct thread_trace *ttrace = thread__priv(thread); 905 906 if (fd > ttrace->paths.max) { 907 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 908 909 if (npath == NULL) 910 return -1; 911 912 if (ttrace->paths.max != -1) { 913 memset(npath + ttrace->paths.max + 1, 0, 914 (fd - ttrace->paths.max) * sizeof(char *)); 915 } else { 916 memset(npath, 0, (fd + 1) * sizeof(char *)); 917 } 918 919 ttrace->paths.table = npath; 920 ttrace->paths.max = fd; 921 } 922 923 ttrace->paths.table[fd] = strdup(pathname); 924 925 return ttrace->paths.table[fd] != NULL ? 
0 : -1; 926 } 927 928 static int thread__read_fd_path(struct thread *thread, int fd) 929 { 930 char linkname[PATH_MAX], pathname[PATH_MAX]; 931 struct stat st; 932 int ret; 933 934 if (thread->pid_ == thread->tid) { 935 scnprintf(linkname, sizeof(linkname), 936 "/proc/%d/fd/%d", thread->pid_, fd); 937 } else { 938 scnprintf(linkname, sizeof(linkname), 939 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 940 } 941 942 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 943 return -1; 944 945 ret = readlink(linkname, pathname, sizeof(pathname)); 946 947 if (ret < 0 || ret > st.st_size) 948 return -1; 949 950 pathname[ret] = '\0'; 951 return trace__set_fd_pathname(thread, fd, pathname); 952 } 953 954 static const char *thread__fd_path(struct thread *thread, int fd, 955 struct trace *trace) 956 { 957 struct thread_trace *ttrace = thread__priv(thread); 958 959 if (ttrace == NULL) 960 return NULL; 961 962 if (fd < 0) 963 return NULL; 964 965 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 966 if (!trace->live) 967 return NULL; 968 ++trace->stats.proc_getname; 969 if (thread__read_fd_path(thread, fd)) 970 return NULL; 971 } 972 973 return ttrace->paths.table[fd]; 974 } 975 976 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg) 977 { 978 int fd = arg->val; 979 size_t printed = scnprintf(bf, size, "%d", fd); 980 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 981 982 if (path) 983 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 984 985 return printed; 986 } 987 988 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 989 struct syscall_arg *arg) 990 { 991 int fd = arg->val; 992 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 993 struct thread_trace *ttrace = thread__priv(arg->thread); 994 995 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 996 zfree(&ttrace->paths.table[fd]); 997 998 return printed; 999 } 1000 1001 static void 
thread__set_filename_pos(struct thread *thread, const char *bf, 1002 unsigned long ptr) 1003 { 1004 struct thread_trace *ttrace = thread__priv(thread); 1005 1006 ttrace->filename.ptr = ptr; 1007 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1008 } 1009 1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1011 struct syscall_arg *arg) 1012 { 1013 unsigned long ptr = arg->val; 1014 1015 if (!arg->trace->vfs_getname) 1016 return scnprintf(bf, size, "%#x", ptr); 1017 1018 thread__set_filename_pos(arg->thread, bf, ptr); 1019 return 0; 1020 } 1021 1022 static bool trace__filter_duration(struct trace *trace, double t) 1023 { 1024 return t < (trace->duration_filter * NSEC_PER_MSEC); 1025 } 1026 1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1028 { 1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1030 1031 return fprintf(fp, "%10.3f ", ts); 1032 } 1033 1034 /* 1035 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are 1036 * using ttrace->entry_time for a thread that receives a sys_exit without 1037 * first having received a sys_enter ("poll" issued before tracing session 1038 * starts, lost sys_enter exit due to ring buffer overflow). 1039 */ 1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1041 { 1042 if (tstamp > 0) 1043 return __trace__fprintf_tstamp(trace, tstamp, fp); 1044 1045 return fprintf(fp, " ? 
"); 1046 } 1047 1048 static bool done = false; 1049 static bool interrupted = false; 1050 1051 static void sig_handler(int sig) 1052 { 1053 done = true; 1054 interrupted = sig == SIGINT; 1055 } 1056 1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 1058 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) 1059 { 1060 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 1061 printed += fprintf_duration(duration, duration_calculated, fp); 1062 1063 if (trace->multiple_threads) { 1064 if (trace->show_comm) 1065 printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); 1066 printed += fprintf(fp, "%d ", thread->tid); 1067 } 1068 1069 return printed; 1070 } 1071 1072 static int trace__process_event(struct trace *trace, struct machine *machine, 1073 union perf_event *event, struct perf_sample *sample) 1074 { 1075 int ret = 0; 1076 1077 switch (event->header.type) { 1078 case PERF_RECORD_LOST: 1079 color_fprintf(trace->output, PERF_COLOR_RED, 1080 "LOST %" PRIu64 " events!\n", event->lost.lost); 1081 ret = machine__process_lost_event(machine, event, sample); 1082 break; 1083 default: 1084 ret = machine__process_event(machine, event, sample); 1085 break; 1086 } 1087 1088 return ret; 1089 } 1090 1091 static int trace__tool_process(struct perf_tool *tool, 1092 union perf_event *event, 1093 struct perf_sample *sample, 1094 struct machine *machine) 1095 { 1096 struct trace *trace = container_of(tool, struct trace, tool); 1097 return trace__process_event(trace, machine, event, sample); 1098 } 1099 1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) 1101 { 1102 struct machine *machine = vmachine; 1103 1104 if (machine->kptr_restrict_warned) 1105 return NULL; 1106 1107 if (symbol_conf.kptr_restrict) { 1108 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" 1109 "Check /proc/sys/kernel/kptr_restrict.\n\n" 1110 "Kernel samples will not be 
resolved.\n"); 1111 machine->kptr_restrict_warned = true; 1112 return NULL; 1113 } 1114 1115 return machine__resolve_kernel_addr(vmachine, addrp, modp); 1116 } 1117 1118 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 1119 { 1120 int err = symbol__init(NULL); 1121 1122 if (err) 1123 return err; 1124 1125 trace->host = machine__new_host(); 1126 if (trace->host == NULL) 1127 return -ENOMEM; 1128 1129 if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0) 1130 return -errno; 1131 1132 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1133 evlist->threads, trace__tool_process, false, 1134 trace->opts.proc_map_timeout); 1135 if (err) 1136 symbol__exit(); 1137 1138 return err; 1139 } 1140 1141 static void trace__symbols__exit(struct trace *trace) 1142 { 1143 machine__exit(trace->host); 1144 trace->host = NULL; 1145 1146 symbol__exit(); 1147 } 1148 1149 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) 1150 { 1151 int idx; 1152 1153 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0) 1154 nr_args = sc->fmt->nr_args; 1155 1156 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt)); 1157 if (sc->arg_fmt == NULL) 1158 return -1; 1159 1160 for (idx = 0; idx < nr_args; ++idx) { 1161 if (sc->fmt) 1162 sc->arg_fmt[idx] = sc->fmt->arg[idx]; 1163 } 1164 1165 sc->nr_args = nr_args; 1166 return 0; 1167 } 1168 1169 static int syscall__set_arg_fmts(struct syscall *sc) 1170 { 1171 struct format_field *field; 1172 int idx = 0, len; 1173 1174 for (field = sc->args; field; field = field->next, ++idx) { 1175 if (sc->fmt && sc->fmt->arg[idx].scnprintf) 1176 continue; 1177 1178 if (strcmp(field->type, "const char *") == 0 && 1179 (strcmp(field->name, "filename") == 0 || 1180 strcmp(field->name, "path") == 0 || 1181 strcmp(field->name, "pathname") == 0)) 1182 sc->arg_fmt[idx].scnprintf = SCA_FILENAME; 1183 else if (field->flags & FIELD_IS_POINTER) 1184 
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}

/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, look up
 * its beautifier entry, hook up the sys_enter_<name> tracepoint format and
 * install per-argument formatters. Grows the table on demand. Returns 0 on
 * success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* zero only the newly added slots; max == -1 means first allocation */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* some syscalls are traced under an alias name in the fmt table */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}

/*
 * Turn the -e/--expr syscall name list (possibly with globs) into the list
 * of syscall ids in trace->ev_qualifier_ids. Returns 0 on success, -EINVAL
 * for unknown names, -ENOMEM on allocation failure.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* not an exact name, try it as a glob */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* first bad name opens the error message, later ones append */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* collect all further glob matches, growing the array on demand */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it.
Most notably this avoids extended load instructions
 * on unaligned addresses
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	/* memcpy() instead of a direct load: the source may be unaligned */
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}

/* Print "name: " for argument arg->idx, falling back to "argN: ". */
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

/* Format one argument value, using the per-arg beautifier when there is one. */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}

/*
 * Format all arguments of one syscall entry into 'bf': uses the tracepoint
 * field list when available, the raw longs otherwise.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
1438 */ 1439 while (arg.idx < sc->nr_args) { 1440 if (arg.mask & bit) 1441 goto next_arg; 1442 val = syscall_arg__val(&arg, arg.idx); 1443 if (printed) 1444 printed += scnprintf(bf + printed, size - printed, ", "); 1445 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg); 1446 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val); 1447 next_arg: 1448 ++arg.idx; 1449 bit <<= 1; 1450 } 1451 } 1452 1453 return printed; 1454 } 1455 1456 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1457 union perf_event *event, 1458 struct perf_sample *sample); 1459 1460 static struct syscall *trace__syscall_info(struct trace *trace, 1461 struct perf_evsel *evsel, int id) 1462 { 1463 1464 if (id < 0) { 1465 1466 /* 1467 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1468 * before that, leaving at a higher verbosity level till that is 1469 * explained. Reproduced with plain ftrace with: 1470 * 1471 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1472 * grep "NR -1 " /t/trace_pipe 1473 * 1474 * After generating some load on the machine. 
1475 */ 1476 if (verbose > 1) { 1477 static u64 n; 1478 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1479 id, perf_evsel__name(evsel), ++n); 1480 } 1481 return NULL; 1482 } 1483 1484 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1485 trace__read_syscall_info(trace, id)) 1486 goto out_cant_read; 1487 1488 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1489 goto out_cant_read; 1490 1491 return &trace->syscalls.table[id]; 1492 1493 out_cant_read: 1494 if (verbose > 0) { 1495 fprintf(trace->output, "Problems reading syscall %d", id); 1496 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1497 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1498 fputs(" information\n", trace->output); 1499 } 1500 return NULL; 1501 } 1502 1503 static void thread__update_stats(struct thread_trace *ttrace, 1504 int id, struct perf_sample *sample) 1505 { 1506 struct int_node *inode; 1507 struct stats *stats; 1508 u64 duration = 0; 1509 1510 inode = intlist__findnew(ttrace->syscall_stats, id); 1511 if (inode == NULL) 1512 return; 1513 1514 stats = inode->priv; 1515 if (stats == NULL) { 1516 stats = malloc(sizeof(struct stats)); 1517 if (stats == NULL) 1518 return; 1519 init_stats(stats); 1520 inode->priv = stats; 1521 } 1522 1523 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1524 duration = sample->time - ttrace->entry_time; 1525 1526 update_stats(stats, duration); 1527 } 1528 1529 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample) 1530 { 1531 struct thread_trace *ttrace; 1532 u64 duration; 1533 size_t printed; 1534 1535 if (trace->current == NULL) 1536 return 0; 1537 1538 ttrace = thread__priv(trace->current); 1539 1540 if (!ttrace->entry_pending) 1541 return 0; 1542 1543 duration = sample->time - ttrace->entry_time; 1544 1545 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, 
					    ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}

/*
 * raw_syscalls:sys_enter handler: format the syscall name + args into
 * ttrace->entry_str; the line is normally emitted at sys_exit time, or
 * right away for syscalls that never return (exit, exit_group).
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* per-thread scratch line, allocated on first use */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* no sys_exit will come; print the entry line now */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
				    struct perf_sample *sample,
				    struct callchain_cursor *cursor)
{
	struct addr_location al;

	if (machine__resolve(trace->host, &al, sample) < 0 ||
	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
		return -1;

	return 0;
}

static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}

/*
 * raw_syscalls:sys_exit handler: complete (or reconstruct) the entry line,
 * print the return value with the appropriate beautifier and optionally
 * the callchain.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* successful open(): bind the vfs_getname path to the new fd */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* sys_enter was lost or predates the session */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* one-shot formatter picked at entry time (e.g. fcntl cmd) */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* return value is a pid (fork, etc.): show the child comm */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

/*
 * probe:vfs_getname handler: stash the resolved pathname and, when a
 * filename argument position was recorded for the pending entry line,
 * splice the name into ttrace->entry_str at that position.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* grow the per-thread name buffer as needed */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* keep the tail of the name when it doesn't fit whole */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}

/* sched:sched_stat_runtime handler: accumulate on-CPU time per thread. */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}

/* print_binary() callback: emit printable chars, '.' for the rest. */
static void bpf_output__printer(enum binary_printer_ops op,
				unsigned int val, void *extra)
{
	FILE *output = extra;
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		fprintf(output, "%c", isprint(ch) ?
ch : '.');
		break;
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}
}

/* Print a BPF output event's raw payload as printable chars. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}

/* Handler for the other events requested with --event. */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* keep columns aligned with the syscall duration field */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}

/* Print "dso@sym+off" (as requested/verbose allows) for a sample address. */
static void print_location(FILE *f, struct perf_sample *sample,
			   struct addr_location *al,
			   bool print_dso, bool print_sym)
{

	if ((verbose > 0 || print_dso) && al->map)
		fprintf(f, "%s@", al->map->dso->long_name);

	if ((verbose > 0 || print_sym) && al->sym)
		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
			al->addr - al->sym->start);
	else if (al->map)
		fprintf(f, "0x%" PRIx64, al->addr);
	else
		fprintf(f, "0x%" PRIx64, sample->addr);
}

/* Page-fault software event handler (minor and major faults). */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		/* 'x' = faulted while executing, '?' = unmapped address */
		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static void trace__set_base_time(struct trace *trace,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally, we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
2015 */ 2016 if (trace->base_time == 0 && !trace->full_time && 2017 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 2018 trace->base_time = sample->time; 2019 } 2020 2021 static int trace__process_sample(struct perf_tool *tool, 2022 union perf_event *event, 2023 struct perf_sample *sample, 2024 struct perf_evsel *evsel, 2025 struct machine *machine __maybe_unused) 2026 { 2027 struct trace *trace = container_of(tool, struct trace, tool); 2028 struct thread *thread; 2029 int err = 0; 2030 2031 tracepoint_handler handler = evsel->handler; 2032 2033 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2034 if (thread && thread__is_filtered(thread)) 2035 goto out; 2036 2037 trace__set_base_time(trace, evsel, sample); 2038 2039 if (handler) { 2040 ++trace->nr_events; 2041 handler(trace, evsel, event, sample); 2042 } 2043 out: 2044 thread__put(thread); 2045 return err; 2046 } 2047 2048 static int trace__record(struct trace *trace, int argc, const char **argv) 2049 { 2050 unsigned int rec_argc, i, j; 2051 const char **rec_argv; 2052 const char * const record_args[] = { 2053 "record", 2054 "-R", 2055 "-m", "1024", 2056 "-c", "1", 2057 }; 2058 2059 const char * const sc_args[] = { "-e", }; 2060 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 2061 const char * const majpf_args[] = { "-e", "major-faults" }; 2062 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 2063 const char * const minpf_args[] = { "-e", "minor-faults" }; 2064 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 2065 2066 /* +1 is for the event string below */ 2067 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 2068 majpf_args_nr + minpf_args_nr + argc; 2069 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 2070 2071 if (rec_argv == NULL) 2072 return -ENOMEM; 2073 2074 j = 0; 2075 for (i = 0; i < ARRAY_SIZE(record_args); i++) 2076 rec_argv[j++] = record_args[i]; 2077 2078 if (trace->trace_syscalls) { 2079 for (i = 0; i < sc_args_nr; i++) 2080 rec_argv[j++] = sc_args[i]; 2081 2082 
		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	/* Pass the remaining command line through to 'perf record'. */
	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	return cmd_record(j, rec_argv);
}

/* Forward declaration: used by trace__run()/trace__replay() before its definition below. */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);

/*
 * Try to add the "probe:vfs_getname" dynamic probe event to the evlist so
 * that filenames can be beautified.  Returns false when the probe is not
 * installed on this system or lacks the expected "pathname" field.
 */
static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");

	if (IS_ERR(evsel))
		return false;

	if (perf_evsel__field(evsel, "pathname") == NULL) {
		/* Probe exists but doesn't capture the field we need. */
		perf_evsel__delete(evsel);
		return false;
	}

	evsel->handler = trace__vfs_getname;
	perf_evlist__add(evlist, evsel);
	return true;
}

/*
 * Create a software page-fault evsel (config is one of the
 * PERF_COUNT_SW_PAGE_FAULTS_* counters).  sample_period = 1 so every fault
 * generates a sample; mmap_data so data mmaps can be resolved for the
 * faulting address.  Returns NULL on allocation failure.
 */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}

/*
 * Dispatch one mmap'ed event: non-sample records (mmap, comm, exit, ...) go
 * to trace__process_event(), samples are routed to the handler installed on
 * the evsel that generated them.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		/* Tracepoint sample without a payload: nothing to decode. */
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			perf_evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}

/*
 * Set up the raw_syscalls:sys_enter/sys_exit tracepoint evsels, caching the
 * "args"/"ret" field accessors, and add them to the evlist.  The created
 * evsels are stashed in trace->syscalls.events.  Returns 0 on success,
 * -1 on failure (with any partially created evsels deleted).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

/*
 * Build an "id in (...)" (or "id not in (...)" when the qualifier was
 * negated with '!') tracepoint filter from the resolved syscall ids and
 * append it to both the sys_enter and sys_exit evsels.  Returns 0 on
 * success, -1 otherwise (errno is set to ENOMEM on allocation failure).
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* append returns 0 on success; only then try the sys_exit evsel too. */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}

/*
 * When tracing system wide we'd otherwise trace ourselves (and, over ssh,
 * the sshd that carries our output), creating a feedback loop.  Filter out
 * our own pid and walk up the parent chain; if an ancestor's comm is "sshd",
 * filter that pid as well.
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}

/*
 * Live tracing: set up all requested events (syscalls, vfs_getname, page
 * faults, sched_stat_runtime), open/mmap the evlist, optionally fork and
 * start a workload, then consume ring buffer events until interrupted or
 * the workload exits (with a draining pass to flush remaining events).
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* remaining argv is the workload to fork */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
			/*
			 * Now we have evsels with different sample_ids, use
			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			 * from a fixed position in each ring buffer record.
			 *
			 * As of this the changeset introducing this comment, this
			 * isn't strictly needed, as the fields that can come before
			 * PERF_SAMPLE_ID are all used, but we'll probably disable
			 * some of those for things like copying the payload of
			 * pointer syscall arguments, and for vfs_getname we don't
			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			 */
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
		       errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		/* --delay: start the workload first, enable counting later. */
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				/* Workload finished: stop producing, flush what's left. */
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * This block is never entered by fall-through (the return above precedes
 * it); it only hosts error labels that share the errbuf declaration.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}

/*
 * Replay mode (-i): process a previously recorded perf.data file, wiring the
 * same sys_enter/sys_exit/page-fault handlers used for live tracing onto the
 * events found in the file.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path = input_name,
		.mode = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample = trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded software page-fault events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

/* Print the header line that precedes the per-thread summary. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

/*
 * rb_resort.h helper: re-sort the per-thread syscall stats intlist by
 * accumulated milliseconds, descending.  The body below fills one resorted
 * entry from an intlist node (source->i is the syscall id, source->priv the
 * struct stats).
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}

/*
 * Print the per-syscall statistics table (calls, total/min/avg/max in msec,
 * stddev as a percentage of avg) for one thread, sorted by total time.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as % of the mean; guard the avg == 0 case. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}

/*
 * Print one thread's summary line (comm, tid, event count and share of all
 * events, fault counts, optional runtime) followed by its syscall table.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}

/* Event count for a thread, tolerating threads with no trace data attached. */
static unsigned long thread__nr_events(struct thread_trace *ttrace)
{
	return ttrace ? ttrace->nr_events : 0;
}

/*
 * rb_resort.h helper: re-sort the machine's threads by number of traced
 * events so the summary lists the busiest threads last.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}

/* Print the whole per-thread summary (header + one section per thread). */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}

/* --duration option callback: parse the minimum duration (ms) filter. */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;

	trace->duration_filter = atof(str);
	return 0;
}

/*
 * --filter-pids option callback: parse a CSV pid list into
 * trace->filter_pids, always prepending our own pid (entry 0) so perf trace
 * never traces itself.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	/* +1 for our own pid at slot 0. */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL)
		goto out;

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	intlist__delete(list);
	ret = 0;
out:
	return ret;
}

/*
 * Open the -o output file for writing, first rotating any pre-existing
 * non-empty file of that name to "<name>.old".  Returns 0 or -errno.
 */
static int trace__open_output(struct trace *trace, const char *filename)
{
	struct stat st;

	if (!stat(filename, &st) && st.st_size) {
		char oldname[PATH_MAX];

		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
		unlink(oldname);
		rename(filename, oldname);
	}

	trace->output = fopen(filename, "w");

	return trace->output == NULL ? -errno : 0;
}

/* -F/--pf option callback: accumulate page-fault flags ("all", "maj", "min"). */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}

/* Install the same sample handler on every evsel in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}

/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	/* lists[1]: syscall names/groups; lists[0]: everything else (events) */
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	if (*s == '!') {
		/* leading '!' negates the whole syscall qualifier */
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* Temporarily NUL-terminate the current CSV term in place. */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			/* Not a syscall; maybe an strace-style group file. */
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		/* Restore the comma we clobbered and move past it. */
		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		/* Hand the non-syscall terms to the stock -e/--event parser. */
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	if (sep)
		*sep = ',';

	return err;
}

/*
 * 'perf trace' entry point: parse options, then dispatch to the 'record'
 * subcommand, replay mode (-i) or live tracing.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			.max = -1,	/* -1: syscall table not populated yet */
		},
		.opts = {
			.target = {
				.uid = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering = true,
			.mmap_pages = UINT_MAX,
			.proc_map_timeout = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		/* Page-fault samples need the faulting address and timestamps. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target specified: default to system wide. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}