1 /* 2 * builtin-trace.c 3 * 4 * Builtin 'trace' command: 5 * 6 * Display a continuously updated trace of any workload, CPU, specific PID, 7 * system wide, etc. Default format is loosely strace like, but any other 8 * event may be specified using --event. 9 * 10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> 11 * 12 * Initially based on the 'trace' prototype by Thomas Gleixner: 13 * 14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'") 15 * 16 * Released under the GPL v2. (and only v2, not any later version) 17 */ 18 19 #include <traceevent/event-parse.h> 20 #include <api/fs/tracing_path.h> 21 #include "builtin.h" 22 #include "util/cgroup.h" 23 #include "util/color.h" 24 #include "util/debug.h" 25 #include "util/env.h" 26 #include "util/event.h" 27 #include "util/evlist.h" 28 #include <subcmd/exec-cmd.h> 29 #include "util/machine.h" 30 #include "util/path.h" 31 #include "util/session.h" 32 #include "util/thread.h" 33 #include <subcmd/parse-options.h> 34 #include "util/strlist.h" 35 #include "util/intlist.h" 36 #include "util/thread_map.h" 37 #include "util/stat.h" 38 #include "trace/beauty/beauty.h" 39 #include "trace-event.h" 40 #include "util/parse-events.h" 41 #include "util/bpf-loader.h" 42 #include "callchain.h" 43 #include "print_binary.h" 44 #include "string2.h" 45 #include "syscalltbl.h" 46 #include "rb_resort.h" 47 48 #include <errno.h> 49 #include <inttypes.h> 50 #include <poll.h> 51 #include <signal.h> 52 #include <stdlib.h> 53 #include <string.h> 54 #include <linux/err.h> 55 #include <linux/filter.h> 56 #include <linux/kernel.h> 57 #include <linux/random.h> 58 #include <linux/stringify.h> 59 #include <linux/time64.h> 60 #include <fcntl.h> 61 62 #include "sane_ctype.h" 63 64 #ifndef O_CLOEXEC 65 # define O_CLOEXEC 02000000 66 #endif 67 68 #ifndef F_LINUX_SPECIFIC_BASE 69 # define F_LINUX_SPECIFIC_BASE 1024 70 #endif 71 72 struct trace { 73 struct perf_tool tool; 74 struct syscalltbl *sctbl; 75 struct { 76 int max; 77 struct syscall *table; 78 struct { 79 struct perf_evsel *sys_enter, 80 *sys_exit, 81 *augmented; 82 } events; 83 } syscalls; 84 struct record_opts opts; 85 struct perf_evlist *evlist; 86 struct machine *host; 87 struct thread *current; 88 struct cgroup *cgroup; 89 u64 base_time; 90 FILE *output; 91 unsigned long nr_events; 92 struct strlist *ev_qualifier; 93 struct { 94 size_t nr; 95 int *entries; 96 } ev_qualifier_ids; 97 struct { 98 size_t nr; 99 pid_t *entries; 100 } filter_pids; 101 double duration_filter; 102 double runtime_ms; 103 struct { 104 u64 vfs_getname, 105 proc_getname; 106 } stats; 107 unsigned int max_stack; 108 unsigned int min_stack; 109 bool not_ev_qualifier; 110 bool live; 111 bool full_time; 112 bool sched; 113 bool multiple_threads; 114 bool summary; 115 bool summary_only; 116 bool failure_only; 117 bool show_comm; 118 bool print_sample; 119 bool show_tool_stats; 120 bool trace_syscalls; 121 bool kernel_syscallchains; 122 bool force; 123 bool vfs_getname; 124 int trace_pgfaults; 125 }; 126 127 struct tp_field { 128 int offset; 129 union { 130 u64 (*integer)(struct tp_field *field, struct perf_sample *sample); 131 void *(*pointer)(struct tp_field *field, struct perf_sample *sample); 132 }; 133 }; 134 135 #define TP_UINT_FIELD(bits) \ 136 static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \ 137 { \ 138 u##bits value; \ 139 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 140 return value; \ 141 } 142 143 TP_UINT_FIELD(8); 144 TP_UINT_FIELD(16); 145 TP_UINT_FIELD(32); 146 TP_UINT_FIELD(64); 147 148 #define TP_UINT_FIELD__SWAPPED(bits) \ 149 static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \ 150 { \ 151 u##bits value; \ 152 memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \ 153 return bswap_##bits(value);\ 154 } 155 156 TP_UINT_FIELD__SWAPPED(16); 157 TP_UINT_FIELD__SWAPPED(32); 158 TP_UINT_FIELD__SWAPPED(64); 159 160 static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap) 161 { 162 field->offset = offset; 163 164 switch (size) { 165 case 1: 166 field->integer = tp_field__u8; 167 break; 168 case 2: 169 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16; 170 break; 171 case 4: 172 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32; 173 break; 174 case 8: 175 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64; 176 break; 177 default: 178 return -1; 179 } 180 181 return 0; 182 } 183 184 static int tp_field__init_uint(struct tp_field *field, struct format_field *format_field, bool needs_swap) 185 { 186 return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap); 187 } 188 189 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample) 190 { 191 return sample->raw_data + field->offset; 192 } 193 194 static int __tp_field__init_ptr(struct tp_field *field, int offset) 195 { 196 field->offset = offset; 197 field->pointer = tp_field__ptr; 198 return 0; 199 } 200 201 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field) 202 { 203 return __tp_field__init_ptr(field, format_field->offset); 204 } 205 206 struct syscall_tp { 207 struct tp_field id; 208 union { 209 struct tp_field args, ret; 210 }; 211 }; 212 213 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel, 214 struct tp_field *field, 215 const char *name) 216 { 217 struct format_field *format_field = perf_evsel__field(evsel, name); 218 219 if (format_field == NULL) 220 return -1; 221 222 return tp_field__init_uint(field, format_field, evsel->needs_swap); 223 } 224 225 #define perf_evsel__init_sc_tp_uint_field(evsel, name) \ 226 ({ struct syscall_tp *sc = evsel->priv;\ 227 perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); }) 228 229 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel, 230 struct tp_field *field, 231 const char *name) 232 { 233 struct format_field *format_field = perf_evsel__field(evsel, name); 234 235 if (format_field == NULL) 236 return -1; 237 238 return tp_field__init_ptr(field, format_field); 239 } 240 241 #define perf_evsel__init_sc_tp_ptr_field(evsel, name) \ 242 ({ struct syscall_tp *sc = evsel->priv;\ 243 perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); }) 244 245 static void perf_evsel__delete_priv(struct perf_evsel *evsel) 246 { 247 zfree(&evsel->priv); 248 perf_evsel__delete(evsel); 249 } 250 251 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel) 252 { 253 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); 254 255 if (evsel->priv != NULL) { 256 if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr")) 257 goto out_delete; 258 return 0; 259 } 260 261 return -ENOMEM; 262 out_delete: 263 zfree(&evsel->priv); 264 return -ENOENT; 265 } 266 267 static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel) 268 { 269 struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp)); 270 271 if (evsel->priv != NULL) { /* field, sizeof_field, offsetof_field */ 272 if (__tp_field__init_uint(&sc->id, sizeof(long), sizeof(long long), evsel->needs_swap)) 273 goto out_delete; 274 275 return 0; 276 } 277 278 return -ENOMEM; 279 out_delete: 280 zfree(&evsel->priv); 281 return -EINVAL; 282 } 283 284 static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel) 285 { 286 struct syscall_tp *sc = evsel->priv; 287 288 return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)); 289 } 290 291 static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler) 292 { 293 evsel->priv = malloc(sizeof(struct syscall_tp)); 294 if (evsel->priv != NULL) { 295 if (perf_evsel__init_sc_tp_uint_field(evsel, id)) 296 goto out_delete; 297 298 evsel->handler = handler; 299 return 0; 300 } 301 302 return -ENOMEM; 303 304 out_delete: 305 zfree(&evsel->priv); 306 return -ENOENT; 307 } 308 309 static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler) 310 { 311 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction); 312 313 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */ 314 if (IS_ERR(evsel)) 315 evsel = perf_evsel__newtp("syscalls", direction); 316 317 if (IS_ERR(evsel)) 318 return NULL; 319 320 if (perf_evsel__init_raw_syscall_tp(evsel, handler)) 321 goto out_delete; 322 323 return evsel; 324 325 out_delete: 326 perf_evsel__delete_priv(evsel); 327 return NULL; 328 } 329 330 #define perf_evsel__sc_tp_uint(evsel, name, sample) \ 331 ({ struct syscall_tp *fields = evsel->priv; \ 332 fields->name.integer(&fields->name, sample); }) 333 334 #define perf_evsel__sc_tp_ptr(evsel, name, sample) \ 335 ({ struct syscall_tp *fields = evsel->priv; \ 336 fields->name.pointer(&fields->name, sample); }) 337 338 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val) 339 { 340 int idx = val - sa->offset; 341 342 if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) 343 return scnprintf(bf, size, intfmt, val); 344 345 return scnprintf(bf, size, "%s", sa->entries[idx]); 346 } 347 348 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size, 349 const char *intfmt, 350 struct syscall_arg *arg) 351 { 352 return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val); 353 } 354 355 static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size, 356 struct syscall_arg *arg) 357 { 358 return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg); 359 } 360 361 #define SCA_STRARRAY syscall_arg__scnprintf_strarray 362 363 struct strarrays { 364 int nr_entries; 365 struct strarray **entries; 366 }; 367 368 #define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \ 369 .nr_entries = ARRAY_SIZE(array), \ 370 .entries = array, \ 371 } 372 373 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size, 374 struct syscall_arg *arg) 375 { 376 struct strarrays *sas = arg->parm; 377 int i; 378 379 for (i = 0; i < sas->nr_entries; ++i) { 380 struct strarray *sa = sas->entries[i]; 381 int idx = arg->val - sa->offset; 382 383 if (idx >= 0 && idx < sa->nr_entries) { 384 if (sa->entries[idx] == NULL) 385 break; 386 return scnprintf(bf, size, "%s", sa->entries[idx]); 387 } 388 } 389 390 return scnprintf(bf, size, "%d", arg->val); 391 } 392 393 #ifndef AT_FDCWD 394 #define AT_FDCWD -100 395 #endif 396 397 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size, 398 struct syscall_arg *arg) 399 { 400 int fd = arg->val; 401 402 if (fd == AT_FDCWD) 403 return scnprintf(bf, size, "CWD"); 404 405 return syscall_arg__scnprintf_fd(bf, size, arg); 406 } 407 408 #define SCA_FDAT syscall_arg__scnprintf_fd_at 409 410 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 411 struct syscall_arg *arg); 412 413 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd 414 415 size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg) 416 { 417 return scnprintf(bf, size, "%#lx", arg->val); 418 } 419 420 size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg) 421 { 422 return scnprintf(bf, size, "%d", arg->val); 423 } 424 425 size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg) 426 { 427 return scnprintf(bf, size, "%ld", arg->val); 428 } 429 430 static const char *bpf_cmd[] = { 431 "MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM", 432 "MAP_GET_NEXT_KEY", "PROG_LOAD", 433 }; 434 static DEFINE_STRARRAY(bpf_cmd); 435 436 static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", }; 437 static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1); 438 439 static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", }; 440 static DEFINE_STRARRAY(itimers); 441 442 static const char *keyctl_options[] = { 443 "GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN", 444 "SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ", 445 "INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT", 446 "ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT", 447 "INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT", 448 }; 449 static DEFINE_STRARRAY(keyctl_options); 450 451 static const char *whences[] = { "SET", "CUR", "END", 452 #ifdef SEEK_DATA 453 "DATA", 454 #endif 455 #ifdef SEEK_HOLE 456 "HOLE", 457 #endif 458 }; 459 static DEFINE_STRARRAY(whences); 460 461 static const char *fcntl_cmds[] = { 462 "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK", 463 "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64", 464 "SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX", 465 "GETOWNER_UIDS", 466 }; 467 static DEFINE_STRARRAY(fcntl_cmds); 468 469 static const char *fcntl_linux_specific_cmds[] = { 470 "SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC", 471 "SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS", 472 "GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT", 473 }; 474 475 static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE); 476 477 static struct strarray *fcntl_cmds_arrays[] = { 478 &strarray__fcntl_cmds, 479 &strarray__fcntl_linux_specific_cmds, 480 }; 481 482 static DEFINE_STRARRAYS(fcntl_cmds_arrays); 483 484 static const char *rlimit_resources[] = { 485 "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE", 486 "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO", 487 "RTTIME", 488 }; 489 static DEFINE_STRARRAY(rlimit_resources); 490 491 static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", }; 492 static DEFINE_STRARRAY(sighow); 493 494 static const char *clockid[] = { 495 "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID", 496 "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME", 497 "REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI" 498 }; 499 static DEFINE_STRARRAY(clockid); 500 501 static const char *socket_families[] = { 502 "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM", 503 "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI", 504 "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC", 505 "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC", 506 "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF", 507 "ALG", "NFC", "VSOCK", 508 }; 509 static DEFINE_STRARRAY(socket_families); 510 511 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size, 512 struct syscall_arg *arg) 513 { 514 size_t printed = 0; 515 int mode = arg->val; 516 517 if (mode == F_OK) /* 0 */ 518 return scnprintf(bf, size, "F"); 519 #define P_MODE(n) \ 520 if (mode & n##_OK) { \ 521 printed += scnprintf(bf + printed, size - printed, "%s", #n); \ 522 mode &= ~n##_OK; \ 523 } 524 525 P_MODE(R); 526 P_MODE(W); 527 P_MODE(X); 528 #undef P_MODE 529 530 if (mode) 531 printed += scnprintf(bf + printed, size - printed, "|%#x", mode); 532 533 return printed; 534 } 535 536 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode 537 538 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 539 struct syscall_arg *arg); 540 541 #define SCA_FILENAME syscall_arg__scnprintf_filename 542 543 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size, 544 struct syscall_arg *arg) 545 { 546 int printed = 0, flags = arg->val; 547 548 #define P_FLAG(n) \ 549 if (flags & O_##n) { \ 550 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 551 flags &= ~O_##n; \ 552 } 553 554 P_FLAG(CLOEXEC); 555 P_FLAG(NONBLOCK); 556 #undef P_FLAG 557 558 if (flags) 559 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 560 561 return printed; 562 } 563 564 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags 565 566 #ifndef GRND_NONBLOCK 567 #define GRND_NONBLOCK 0x0001 568 #endif 569 #ifndef GRND_RANDOM 570 #define GRND_RANDOM 0x0002 571 #endif 572 573 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size, 574 struct syscall_arg *arg) 575 { 576 int printed = 0, flags = arg->val; 577 578 #define P_FLAG(n) \ 579 if (flags & GRND_##n) { \ 580 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \ 581 flags &= ~GRND_##n; \ 582 } 583 584 P_FLAG(RANDOM); 585 P_FLAG(NONBLOCK); 586 #undef P_FLAG 587 588 if (flags) 589 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags); 590 591 return printed; 592 } 593 594 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags 595 596 #define STRARRAY(name, array) \ 597 { .scnprintf = SCA_STRARRAY, \ 598 .parm = &strarray__##array, } 599 600 #include "trace/beauty/arch_errno_names.c" 601 #include "trace/beauty/eventfd.c" 602 #include "trace/beauty/futex_op.c" 603 #include "trace/beauty/futex_val3.c" 604 #include "trace/beauty/mmap.c" 605 #include "trace/beauty/mode_t.c" 606 #include "trace/beauty/msg_flags.c" 607 #include "trace/beauty/open_flags.c" 608 #include "trace/beauty/perf_event_open.c" 609 #include "trace/beauty/pid.c" 610 #include "trace/beauty/sched_policy.c" 611 #include "trace/beauty/seccomp.c" 612 #include "trace/beauty/signum.c" 613 #include "trace/beauty/socket_type.c" 614 #include "trace/beauty/waitid_options.c" 615 616 struct syscall_arg_fmt { 617 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 618 void *parm; 619 const char *name; 620 bool show_zero; 621 }; 622 623 static struct syscall_fmt { 624 const char *name; 625 const char *alias; 626 struct syscall_arg_fmt arg[6]; 627 u8 nr_args; 628 bool errpid; 629 bool timeout; 630 bool hexret; 631 } syscall_fmts[] = { 632 { .name = "access", 633 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, 634 { .name = "bpf", 635 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, 636 { .name = "brk", .hexret = true, 637 .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, }, 638 { .name = "clock_gettime", 639 .arg = { [0] = STRARRAY(clk_id, clockid), }, }, 640 { .name = "clone", .errpid = true, .nr_args = 5, 641 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, 642 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, 643 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, }, 644 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, }, 645 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, }, 646 { .name = "close", 647 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, 648 { .name = "epoll_ctl", 649 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, 650 { .name = "eventfd2", 651 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, 652 { .name = "fchmodat", 653 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 654 { .name = "fchownat", 655 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 656 { .name = "fcntl", 657 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */ 658 .parm = &strarrays__fcntl_cmds_arrays, 659 .show_zero = true, }, 660 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, }, 661 { .name = "flock", 662 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, }, 663 { .name = "fstat", .alias = "newfstat", }, 664 { .name = "fstatat", .alias = "newfstatat", }, 665 { .name = "futex", 666 .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, 667 [5] = { .scnprintf = SCA_FUTEX_VAL3, /* val3 */ }, }, }, 668 { .name = "futimesat", 669 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 670 { .name = "getitimer", 671 .arg = { [0] = STRARRAY(which, itimers), }, }, 672 { .name = "getpid", .errpid = true, }, 673 { .name = "getpgid", .errpid = true, }, 674 { .name = "getppid", .errpid = true, }, 675 { .name = "getrandom", 676 .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, }, 677 { .name = "getrlimit", 678 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, 679 { .name = "gettid", .errpid = true, }, 680 { .name = "ioctl", 681 .arg = { 682 #if defined(__i386__) || defined(__x86_64__) 683 /* 684 * FIXME: Make this available to all arches. 685 */ 686 [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ }, 687 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, }, 688 #else 689 [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, }, 690 #endif 691 { .name = "kcmp", .nr_args = 5, 692 .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, }, 693 [1] = { .name = "pid2", .scnprintf = SCA_PID, }, 694 [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, }, 695 [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, }, 696 [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, }, 697 { .name = "keyctl", 698 .arg = { [0] = STRARRAY(option, keyctl_options), }, }, 699 { .name = "kill", 700 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 701 { .name = "linkat", 702 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 703 { .name = "lseek", 704 .arg = { [2] = STRARRAY(whence, whences), }, }, 705 { .name = "lstat", .alias = "newlstat", }, 706 { .name = "madvise", 707 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 708 [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, }, 709 { .name = "mkdirat", 710 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 711 { .name = "mknodat", 712 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 713 { .name = "mlock", 714 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 715 { .name = "mlockall", 716 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 717 { .name = "mmap", .hexret = true, 718 /* The standard mmap maps to old_mmap on s390x */ 719 #if defined(__s390x__) 720 .alias = "old_mmap", 721 #endif 722 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, 723 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, 724 [3] = { .scnprintf = SCA_MMAP_FLAGS, /* flags */ }, }, }, 725 { .name = "mprotect", 726 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 727 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, }, }, 728 { .name = "mq_unlink", 729 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, }, 730 { .name = "mremap", .hexret = true, 731 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, 732 [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, 733 [4] = { .scnprintf = SCA_HEX, /* new_addr */ }, }, }, 734 { .name = "munlock", 735 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 736 { .name = "munmap", 737 .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, }, 738 { .name = "name_to_handle_at", 739 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 740 { .name = "newfstatat", 741 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 742 { .name = "open", 743 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 744 { .name = "open_by_handle_at", 745 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 746 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 747 { .name = "openat", 748 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 749 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 750 { .name = "perf_event_open", 751 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ }, 752 [3] = { .scnprintf = SCA_FD, /* group_fd */ }, 753 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, 754 { .name = "pipe2", 755 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, }, 756 { .name = "pkey_alloc", 757 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, }, 758 { .name = "pkey_free", 759 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, }, 760 { .name = "pkey_mprotect", 761 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 762 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, 763 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, 764 { .name = "poll", .timeout = true, }, 765 { .name = "ppoll", .timeout = true, }, 766 { .name = "prctl", .alias = "arch_prctl", 767 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ }, 768 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ }, 769 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, }, 770 { .name = "pread", .alias = "pread64", }, 771 { .name = "preadv", .alias = "pread", }, 772 { .name = "prlimit64", 773 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, }, 774 { .name = "pwrite", .alias = "pwrite64", }, 775 { .name = "readlinkat", 776 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 777 { .name = "recvfrom", 778 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 779 { .name = "recvmmsg", 780 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 781 { .name = "recvmsg", 782 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 783 { .name = "renameat", 784 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 785 { .name = "rt_sigaction", 786 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 787 { .name = "rt_sigprocmask", 788 .arg = { [0] = STRARRAY(how, sighow), }, }, 789 { .name = "rt_sigqueueinfo", 790 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 791 { .name = "rt_tgsigqueueinfo", 792 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 793 { .name = "sched_setscheduler", 794 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, }, 795 { .name = "seccomp", 796 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ }, 797 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, }, 798 { .name = "select", .timeout = true, }, 799 { .name = "sendmmsg", 800 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 801 { .name = "sendmsg", 802 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 803 { .name = "sendto", 804 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 805 { .name = "set_tid_address", .errpid = true, }, 806 { .name = "setitimer", 807 .arg = { [0] = STRARRAY(which, itimers), }, }, 808 { .name = "setrlimit", 809 .arg = { [0] = STRARRAY(resource, rlimit_resources), }, }, 810 { .name = "socket", 811 .arg = { [0] = STRARRAY(family, socket_families), 812 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, 813 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, 814 { .name = "socketpair", 815 .arg = { [0] = STRARRAY(family, socket_families), 816 [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, 817 [2] = { .scnprintf = SCA_SK_PROTO, /* protocol */ }, }, }, 818 { .name = "stat", .alias = "newstat", }, 819 { .name = "statx", 820 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fdat */ }, 821 [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } , 822 [3] = { .scnprintf = SCA_STATX_MASK, /* mask */ }, }, }, 823 { .name = "swapoff", 824 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, 825 { .name = "swapon", 826 .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, }, 827 { .name = "symlinkat", 828 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 829 { .name = "tgkill", 830 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 831 { .name = "tkill", 832 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 833 { .name = "uname", .alias = "newuname", }, 834 { .name = "unlinkat", 835 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 836 { .name = "utimensat", 837 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, }, 838 { .name = "wait4", .errpid = true, 839 .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, 840 { .name = "waitid", .errpid = true, 841 .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, }, 842 }; 843 844 static int syscall_fmt__cmp(const void *name, const void *fmtp) 845 { 846 const struct syscall_fmt *fmt = fmtp; 847 return strcmp(name, fmt->name); 848 } 849 850 static struct syscall_fmt *syscall_fmt__find(const char *name) 851 { 852 const int nmemb = ARRAY_SIZE(syscall_fmts); 853 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp); 854 } 855 856 /* 857 * is_exit: is this "exit" or "exit_group"? 858 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter. 859 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc. 860 */ 861 struct syscall { 862 struct event_format *tp_format; 863 int nr_args; 864 int args_size; 865 bool is_exit; 866 bool is_open; 867 struct format_field *args; 868 const char *name; 869 struct syscall_fmt *fmt; 870 struct syscall_arg_fmt *arg_fmt; 871 }; 872 873 /* 874 * We need to have this 'calculated' boolean because in some cases we really 875 * don't know what is the duration of a syscall, for instance, when we start 876 * a session and some threads are waiting for a syscall to finish, say 'poll', 877 * in which case all we can do is to print "( ? ) for duration and for the 878 * start timestamp. 879 */ 880 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp) 881 { 882 double duration = (double)t / NSEC_PER_MSEC; 883 size_t printed = fprintf(fp, "("); 884 885 if (!calculated) 886 printed += fprintf(fp, " "); 887 else if (duration >= 1.0) 888 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration); 889 else if (duration >= 0.01) 890 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration); 891 else 892 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration); 893 return printed + fprintf(fp, "): "); 894 } 895 896 /** 897 * filename.ptr: The filename char pointer that will be vfs_getname'd 898 * filename.entry_str_pos: Where to insert the string translated from 899 * filename.ptr by the vfs_getname tracepoint/kprobe. 900 * ret_scnprintf: syscall args may set this to a different syscall return 901 * formatter, for instance, fcntl may return fds, file flags, etc. 902 */ 903 struct thread_trace { 904 u64 entry_time; 905 bool entry_pending; 906 unsigned long nr_events; 907 unsigned long pfmaj, pfmin; 908 char *entry_str; 909 double runtime_ms; 910 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 911 struct { 912 unsigned long ptr; 913 short int entry_str_pos; 914 bool pending_open; 915 unsigned int namelen; 916 char *name; 917 } filename; 918 struct { 919 int max; 920 char **table; 921 } paths; 922 923 struct intlist *syscall_stats; 924 }; 925 926 static struct thread_trace *thread_trace__new(void) 927 { 928 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace)); 929 930 if (ttrace) 931 ttrace->paths.max = -1; 932 933 ttrace->syscall_stats = intlist__new(NULL); 934 935 return ttrace; 936 } 937 938 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp) 939 { 940 struct thread_trace *ttrace; 941 942 if (thread == NULL) 943 goto fail; 944 945 if (thread__priv(thread) == NULL) 946 thread__set_priv(thread, thread_trace__new()); 947 948 if (thread__priv(thread) == NULL) 949 goto fail; 950 951 ttrace = thread__priv(thread); 952 ++ttrace->nr_events; 953 954 return ttrace; 955 fail: 956 color_fprintf(fp, PERF_COLOR_RED, 957 "WARNING: not enough memory, dropping samples!\n"); 958 return NULL; 959 } 960 961 962 void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg, 963 size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg)) 964 { 965 struct thread_trace *ttrace = thread__priv(arg->thread); 966 967 ttrace->ret_scnprintf = ret_scnprintf; 968 } 969 970 #define TRACE_PFMAJ (1 << 0) 971 #define TRACE_PFMIN (1 << 1) 972 973 static const size_t trace__entry_str_size = 2048; 974 975 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname) 976 { 977 struct thread_trace *ttrace = thread__priv(thread); 978 979 if (fd > ttrace->paths.max) { 980 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *)); 981 982 if (npath == NULL) 983 return -1; 984 985 if (ttrace->paths.max != -1) { 986 memset(npath + ttrace->paths.max + 1, 0, 987 (fd - ttrace->paths.max) * sizeof(char *)); 988 } else { 989 memset(npath, 0, (fd + 1) * sizeof(char *)); 990 } 991 992 ttrace->paths.table = npath; 993 ttrace->paths.max = fd; 994 } 995 996 ttrace->paths.table[fd] = strdup(pathname); 997 998 return ttrace->paths.table[fd] != NULL ? 0 : -1; 999 } 1000 1001 static int thread__read_fd_path(struct thread *thread, int fd) 1002 { 1003 char linkname[PATH_MAX], pathname[PATH_MAX]; 1004 struct stat st; 1005 int ret; 1006 1007 if (thread->pid_ == thread->tid) { 1008 scnprintf(linkname, sizeof(linkname), 1009 "/proc/%d/fd/%d", thread->pid_, fd); 1010 } else { 1011 scnprintf(linkname, sizeof(linkname), 1012 "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd); 1013 } 1014 1015 if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname)) 1016 return -1; 1017 1018 ret = readlink(linkname, pathname, sizeof(pathname)); 1019 1020 if (ret < 0 || ret > st.st_size) 1021 return -1; 1022 1023 pathname[ret] = '\0'; 1024 return trace__set_fd_pathname(thread, fd, pathname); 1025 } 1026 1027 static const char *thread__fd_path(struct thread *thread, int fd, 1028 struct trace *trace) 1029 { 1030 struct thread_trace *ttrace = thread__priv(thread); 1031 1032 if (ttrace == NULL) 1033 return NULL; 1034 1035 if (fd < 0) 1036 return NULL; 1037 1038 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) { 1039 if (!trace->live) 1040 return NULL; 1041 ++trace->stats.proc_getname; 1042 if (thread__read_fd_path(thread, fd)) 1043 return NULL; 1044 } 1045 1046 return ttrace->paths.table[fd]; 1047 } 1048 1049 size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg) 1050 { 1051 int fd = arg->val; 1052 size_t printed = scnprintf(bf, size, "%d", fd); 1053 const char *path = thread__fd_path(arg->thread, fd, arg->trace); 1054 1055 if (path) 1056 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 1057 1058 return printed; 1059 } 1060 1061 size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size) 1062 { 1063 size_t printed = scnprintf(bf, size, "%d", fd); 1064 struct thread *thread = machine__find_thread(trace->host, pid, pid); 1065 1066 if (thread) { 1067 const char *path = thread__fd_path(thread, fd, trace); 1068 1069 if (path) 1070 printed += scnprintf(bf + printed, size - printed, "<%s>", path); 1071 1072 thread__put(thread); 1073 } 1074 1075 return printed; 1076 } 1077 1078 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size, 1079 struct syscall_arg *arg) 1080 { 1081 int fd = arg->val; 1082 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg); 1083 struct thread_trace *ttrace = thread__priv(arg->thread); 1084 1085 if (ttrace && fd >= 0 && fd <= ttrace->paths.max) 1086 zfree(&ttrace->paths.table[fd]); 1087 1088 return printed; 1089 } 1090 1091 static void thread__set_filename_pos(struct thread *thread, const char *bf, 1092 unsigned long ptr) 1093 { 1094 struct thread_trace *ttrace = thread__priv(thread); 1095 1096 ttrace->filename.ptr = ptr; 1097 ttrace->filename.entry_str_pos = bf - ttrace->entry_str; 1098 } 1099 1100 static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size) 1101 { 1102 struct augmented_arg *augmented_arg = arg->augmented.args; 1103 1104 return scnprintf(bf, size, "%.*s", augmented_arg->size, augmented_arg->value); 1105 } 1106 1107 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size, 1108 struct syscall_arg *arg) 1109 { 1110 unsigned long ptr = arg->val; 1111 1112 if (arg->augmented.args) 1113 return syscall_arg__scnprintf_augmented_string(arg, bf, size); 1114 1115 if (!arg->trace->vfs_getname) 1116 return scnprintf(bf, size, "%#x", ptr); 1117 1118 thread__set_filename_pos(arg->thread, bf, ptr); 1119 return 0; 1120 } 1121 1122 static bool trace__filter_duration(struct trace *trace, double t) 1123 { 1124 return t < (trace->duration_filter * NSEC_PER_MSEC); 1125 } 1126 1127 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1128 { 1129 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC; 1130 1131 return fprintf(fp, "%10.3f ", ts); 1132 } 1133 1134 /* 1135 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are 1136 * using ttrace->entry_time for a thread that receives a sys_exit without 1137 * first having received a sys_enter ("poll" issued before tracing session 1138 * starts, lost sys_enter exit due to ring buffer overflow). 1139 */ 1140 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp) 1141 { 1142 if (tstamp > 0) 1143 return __trace__fprintf_tstamp(trace, tstamp, fp); 1144 1145 return fprintf(fp, " ? "); 1146 } 1147 1148 static bool done = false; 1149 static bool interrupted = false; 1150 1151 static void sig_handler(int sig) 1152 { 1153 done = true; 1154 interrupted = sig == SIGINT; 1155 } 1156 1157 static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp) 1158 { 1159 size_t printed = 0; 1160 1161 if (trace->multiple_threads) { 1162 if (trace->show_comm) 1163 printed += fprintf(fp, "%.14s/", thread__comm_str(thread)); 1164 printed += fprintf(fp, "%d ", thread->tid); 1165 } 1166 1167 return printed; 1168 } 1169 1170 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread, 1171 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp) 1172 { 1173 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp); 1174 printed += fprintf_duration(duration, duration_calculated, fp); 1175 return printed + trace__fprintf_comm_tid(trace, thread, fp); 1176 } 1177 1178 static int trace__process_event(struct trace *trace, struct machine *machine, 1179 union perf_event *event, struct perf_sample *sample) 1180 { 1181 int ret = 0; 1182 1183 switch (event->header.type) { 1184 case PERF_RECORD_LOST: 1185 color_fprintf(trace->output, PERF_COLOR_RED, 1186 "LOST %" PRIu64 " events!\n", event->lost.lost); 1187 ret = machine__process_lost_event(machine, event, sample); 1188 break; 1189 default: 1190 ret = machine__process_event(machine, event, sample); 1191 break; 1192 } 1193 1194 return ret; 1195 } 1196 1197 static int trace__tool_process(struct perf_tool *tool, 1198 union perf_event *event, 1199 struct perf_sample *sample, 1200 struct machine *machine) 1201 { 1202 struct trace *trace = container_of(tool, struct trace, tool); 1203 return trace__process_event(trace, machine, event, sample); 1204 } 1205 1206 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp) 1207 { 1208 struct machine *machine = vmachine; 1209 1210 if (machine->kptr_restrict_warned) 1211 return NULL; 1212 1213 if (symbol_conf.kptr_restrict) { 1214 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n" 1215 "Check /proc/sys/kernel/kptr_restrict.\n\n" 1216 "Kernel samples will not be resolved.\n"); 1217 machine->kptr_restrict_warned = true; 1218 return NULL; 1219 } 1220 1221 return machine__resolve_kernel_addr(vmachine, addrp, modp); 1222 } 1223 1224 static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist) 1225 { 1226 int err = symbol__init(NULL); 1227 1228 if (err) 1229 return err; 1230 1231 trace->host = machine__new_host(); 1232 if (trace->host == NULL) 1233 return -ENOMEM; 1234 1235 err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr); 1236 if (err < 0) 1237 goto out; 1238 1239 err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target, 1240 evlist->threads, trace__tool_process, false, 1241 trace->opts.proc_map_timeout, 1); 1242 out: 1243 if (err) 1244 symbol__exit(); 1245 1246 return err; 1247 } 1248 1249 static void trace__symbols__exit(struct trace *trace) 1250 { 1251 machine__exit(trace->host); 1252 trace->host = NULL; 1253 1254 symbol__exit(); 1255 } 1256 1257 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args) 1258 { 1259 int idx; 1260 1261 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0) 1262 nr_args = sc->fmt->nr_args; 1263 1264 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt)); 1265 if (sc->arg_fmt == NULL) 1266 return -1; 1267 1268 for (idx = 0; idx < nr_args; ++idx) { 1269 if (sc->fmt) 1270 sc->arg_fmt[idx] = sc->fmt->arg[idx]; 1271 } 1272 1273 sc->nr_args = nr_args; 1274 return 0; 1275 } 1276 1277 static int syscall__set_arg_fmts(struct syscall *sc) 1278 { 1279 struct format_field *field, *last_field = NULL; 1280 int idx = 0, len; 1281 1282 for (field = sc->args; field; field = field->next, ++idx) { 1283 last_field = field; 1284 1285 if (sc->fmt && sc->fmt->arg[idx].scnprintf) 1286 continue; 1287 1288 if (strcmp(field->type, "const char *") == 0 && 1289 (strcmp(field->name, "filename") == 0 || 1290 strcmp(field->name, "path") == 0 || 1291 strcmp(field->name, "pathname") == 0)) 1292 sc->arg_fmt[idx].scnprintf = SCA_FILENAME; 1293 else if (field->flags & FIELD_IS_POINTER) 1294 sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex; 1295 else if (strcmp(field->type, "pid_t") == 0) 1296 sc->arg_fmt[idx].scnprintf = SCA_PID; 1297 else if (strcmp(field->type, "umode_t") == 0) 1298 sc->arg_fmt[idx].scnprintf = SCA_MODE_T; 1299 else if ((strcmp(field->type, "int") == 0 || 1300 strcmp(field->type, "unsigned int") == 0 || 1301 strcmp(field->type, "long") == 0) && 1302 (len = strlen(field->name)) >= 2 && 1303 strcmp(field->name + len - 2, "fd") == 0) { 1304 /* 1305 * /sys/kernel/tracing/events/syscalls/sys_enter* 1306 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c 1307 * 65 int 1308 * 23 unsigned int 1309 * 7 unsigned long 1310 */ 1311 sc->arg_fmt[idx].scnprintf = SCA_FD; 1312 } 1313 } 1314 1315 if (last_field) 1316 sc->args_size = last_field->offset + last_field->size; 1317 1318 return 0; 1319 } 1320 1321 static int trace__read_syscall_info(struct trace *trace, int id) 1322 { 1323 char tp_name[128]; 1324 struct syscall *sc; 1325 const char *name = syscalltbl__name(trace->sctbl, id); 1326 1327 if (name == NULL) 1328 return -1; 1329 1330 if (id > trace->syscalls.max) { 1331 struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc)); 1332 1333 if (nsyscalls == NULL) 1334 return -1; 1335 1336 if (trace->syscalls.max != -1) { 1337 memset(nsyscalls + trace->syscalls.max + 1, 0, 1338 (id - trace->syscalls.max) * sizeof(*sc)); 1339 } else { 1340 memset(nsyscalls, 0, (id + 1) * sizeof(*sc)); 1341 } 1342 1343 trace->syscalls.table = nsyscalls; 1344 trace->syscalls.max = id; 1345 } 1346 1347 sc = trace->syscalls.table + id; 1348 sc->name = name; 1349 1350 sc->fmt = syscall_fmt__find(sc->name); 1351 1352 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name); 1353 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1354 1355 if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) { 1356 snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias); 1357 sc->tp_format = trace_event__tp_format("syscalls", tp_name); 1358 } 1359 1360 if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields)) 1361 return -1; 1362 1363 if (IS_ERR(sc->tp_format)) 1364 return -1; 1365 1366 sc->args = sc->tp_format->format.fields; 1367 /* 1368 * We need to check and discard the first variable '__syscall_nr' 1369 * or 'nr' that mean the syscall number. It is needless here. 1370 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels. 1371 */ 1372 if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) { 1373 sc->args = sc->args->next; 1374 --sc->nr_args; 1375 } 1376 1377 sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit"); 1378 sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat"); 1379 1380 return syscall__set_arg_fmts(sc); 1381 } 1382 1383 static int trace__validate_ev_qualifier(struct trace *trace) 1384 { 1385 int err = 0, i; 1386 size_t nr_allocated; 1387 struct str_node *pos; 1388 1389 trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier); 1390 trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr * 1391 sizeof(trace->ev_qualifier_ids.entries[0])); 1392 1393 if (trace->ev_qualifier_ids.entries == NULL) { 1394 fputs("Error:\tNot enough memory for allocating events qualifier ids\n", 1395 trace->output); 1396 err = -EINVAL; 1397 goto out; 1398 } 1399 1400 nr_allocated = trace->ev_qualifier_ids.nr; 1401 i = 0; 1402 1403 strlist__for_each_entry(pos, trace->ev_qualifier) { 1404 const char *sc = pos->s; 1405 int id = syscalltbl__id(trace->sctbl, sc), match_next = -1; 1406 1407 if (id < 0) { 1408 id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next); 1409 if (id >= 0) 1410 goto matches; 1411 1412 if (err == 0) { 1413 fputs("Error:\tInvalid syscall ", trace->output); 1414 err = -EINVAL; 1415 } else { 1416 fputs(", ", trace->output); 1417 } 1418 1419 fputs(sc, trace->output); 1420 } 1421 matches: 1422 trace->ev_qualifier_ids.entries[i++] = id; 1423 if (match_next == -1) 1424 continue; 1425 1426 while (1) { 1427 id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next); 1428 if (id < 0) 1429 break; 1430 if (nr_allocated == trace->ev_qualifier_ids.nr) { 1431 void *entries; 1432 1433 nr_allocated += 8; 1434 entries = realloc(trace->ev_qualifier_ids.entries, 1435 nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0])); 1436 if (entries == NULL) { 1437 err = -ENOMEM; 1438 fputs("\nError:\t Not enough memory for parsing\n", trace->output); 1439 goto out_free; 1440 } 1441 trace->ev_qualifier_ids.entries = entries; 1442 } 1443 trace->ev_qualifier_ids.nr++; 1444 trace->ev_qualifier_ids.entries[i++] = id; 1445 } 1446 } 1447 1448 if (err < 0) { 1449 fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'" 1450 "\nHint:\tand: 'man syscalls'\n", trace->output); 1451 out_free: 1452 zfree(&trace->ev_qualifier_ids.entries); 1453 trace->ev_qualifier_ids.nr = 0; 1454 } 1455 out: 1456 return err; 1457 } 1458 1459 /* 1460 * args is to be interpreted as a series of longs but we need to handle 1461 * 8-byte unaligned accesses. args points to raw_data within the event 1462 * and raw_data is guaranteed to be 8-byte unaligned because it is 1463 * preceded by raw_size which is a u32. So we need to copy args to a temp 1464 * variable to read it. Most notably this avoids extended load instructions 1465 * on unaligned addresses 1466 */ 1467 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx) 1468 { 1469 unsigned long val; 1470 unsigned char *p = arg->args + sizeof(unsigned long) * idx; 1471 1472 memcpy(&val, p, sizeof(val)); 1473 return val; 1474 } 1475 1476 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size, 1477 struct syscall_arg *arg) 1478 { 1479 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name) 1480 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name); 1481 1482 return scnprintf(bf, size, "arg%d: ", arg->idx); 1483 } 1484 1485 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size, 1486 struct syscall_arg *arg, unsigned long val) 1487 { 1488 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) { 1489 arg->val = val; 1490 if (sc->arg_fmt[arg->idx].parm) 1491 arg->parm = sc->arg_fmt[arg->idx].parm; 1492 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg); 1493 } 1494 return scnprintf(bf, size, "%ld", val); 1495 } 1496 1497 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size, 1498 unsigned char *args, void *augmented_args, int augmented_args_size, 1499 struct trace *trace, struct thread *thread) 1500 { 1501 size_t printed = 0; 1502 unsigned long val; 1503 u8 bit = 1; 1504 struct syscall_arg arg = { 1505 .args = args, 1506 .augmented = { 1507 .size = augmented_args_size, 1508 .args = augmented_args, 1509 }, 1510 .idx = 0, 1511 .mask = 0, 1512 .trace = trace, 1513 .thread = thread, 1514 }; 1515 struct thread_trace *ttrace = thread__priv(thread); 1516 1517 /* 1518 * Things like fcntl will set this in its 'cmd' formatter to pick the 1519 * right formatter for the return value (an fd? file flags?), which is 1520 * not needed for syscalls that always return a given type, say an fd. 1521 */ 1522 ttrace->ret_scnprintf = NULL; 1523 1524 if (sc->args != NULL) { 1525 struct format_field *field; 1526 1527 for (field = sc->args; field; 1528 field = field->next, ++arg.idx, bit <<= 1) { 1529 if (arg.mask & bit) 1530 continue; 1531 1532 val = syscall_arg__val(&arg, arg.idx); 1533 1534 /* 1535 * Suppress this argument if its value is zero and 1536 * and we don't have a string associated in an 1537 * strarray for it. 1538 */ 1539 if (val == 0 && 1540 !(sc->arg_fmt && 1541 (sc->arg_fmt[arg.idx].show_zero || 1542 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY || 1543 sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) && 1544 sc->arg_fmt[arg.idx].parm)) 1545 continue; 1546 1547 printed += scnprintf(bf + printed, size - printed, 1548 "%s%s: ", printed ? ", " : "", field->name); 1549 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val); 1550 } 1551 } else if (IS_ERR(sc->tp_format)) { 1552 /* 1553 * If we managed to read the tracepoint /format file, then we 1554 * may end up not having any args, like with gettid(), so only 1555 * print the raw args when we didn't manage to read it. 1556 */ 1557 while (arg.idx < sc->nr_args) { 1558 if (arg.mask & bit) 1559 goto next_arg; 1560 val = syscall_arg__val(&arg, arg.idx); 1561 if (printed) 1562 printed += scnprintf(bf + printed, size - printed, ", "); 1563 printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg); 1564 printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val); 1565 next_arg: 1566 ++arg.idx; 1567 bit <<= 1; 1568 } 1569 } 1570 1571 return printed; 1572 } 1573 1574 typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel, 1575 union perf_event *event, 1576 struct perf_sample *sample); 1577 1578 static struct syscall *trace__syscall_info(struct trace *trace, 1579 struct perf_evsel *evsel, int id) 1580 { 1581 1582 if (id < 0) { 1583 1584 /* 1585 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried 1586 * before that, leaving at a higher verbosity level till that is 1587 * explained. Reproduced with plain ftrace with: 1588 * 1589 * echo 1 > /t/events/raw_syscalls/sys_exit/enable 1590 * grep "NR -1 " /t/trace_pipe 1591 * 1592 * After generating some load on the machine. 1593 */ 1594 if (verbose > 1) { 1595 static u64 n; 1596 fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n", 1597 id, perf_evsel__name(evsel), ++n); 1598 } 1599 return NULL; 1600 } 1601 1602 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) && 1603 trace__read_syscall_info(trace, id)) 1604 goto out_cant_read; 1605 1606 if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL)) 1607 goto out_cant_read; 1608 1609 return &trace->syscalls.table[id]; 1610 1611 out_cant_read: 1612 if (verbose > 0) { 1613 fprintf(trace->output, "Problems reading syscall %d", id); 1614 if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL) 1615 fprintf(trace->output, "(%s)", trace->syscalls.table[id].name); 1616 fputs(" information\n", trace->output); 1617 } 1618 return NULL; 1619 } 1620 1621 static void thread__update_stats(struct thread_trace *ttrace, 1622 int id, struct perf_sample *sample) 1623 { 1624 struct int_node *inode; 1625 struct stats *stats; 1626 u64 duration = 0; 1627 1628 inode = intlist__findnew(ttrace->syscall_stats, id); 1629 if (inode == NULL) 1630 return; 1631 1632 stats = inode->priv; 1633 if (stats == NULL) { 1634 stats = malloc(sizeof(struct stats)); 1635 if (stats == NULL) 1636 return; 1637 init_stats(stats); 1638 inode->priv = stats; 1639 } 1640 1641 if (ttrace->entry_time && sample->time > ttrace->entry_time) 1642 duration = sample->time - ttrace->entry_time; 1643 1644 update_stats(stats, duration); 1645 } 1646 1647 static int trace__printf_interrupted_entry(struct trace *trace) 1648 { 1649 struct thread_trace *ttrace; 1650 size_t printed; 1651 1652 if (trace->failure_only || trace->current == NULL) 1653 return 0; 1654 1655 ttrace = thread__priv(trace->current); 1656 1657 if (!ttrace->entry_pending) 1658 return 0; 1659 1660 printed = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output); 1661 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str); 1662 ttrace->entry_pending = false; 1663 1664 return printed; 1665 } 1666 1667 static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel, 1668 struct perf_sample *sample, struct thread *thread) 1669 { 1670 int printed = 0; 1671 1672 if (trace->print_sample) { 1673 double ts = (double)sample->time / NSEC_PER_MSEC; 1674 1675 printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n", 1676 perf_evsel__name(evsel), ts, 1677 thread__comm_str(thread), 1678 sample->pid, sample->tid, sample->cpu); 1679 } 1680 1681 return printed; 1682 } 1683 1684 static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel, 1685 union perf_event *event __maybe_unused, 1686 struct perf_sample *sample) 1687 { 1688 char *msg; 1689 void *args; 1690 size_t printed = 0; 1691 struct thread *thread; 1692 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1693 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1694 struct thread_trace *ttrace; 1695 1696 if (sc == NULL) 1697 return -1; 1698 1699 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1700 ttrace = thread__trace(thread, trace->output); 1701 if (ttrace == NULL) 1702 goto out_put; 1703 1704 trace__fprintf_sample(trace, evsel, sample, thread); 1705 1706 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1707 1708 if (ttrace->entry_str == NULL) { 1709 ttrace->entry_str = malloc(trace__entry_str_size); 1710 if (!ttrace->entry_str) 1711 goto out_put; 1712 } 1713 1714 if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) 1715 trace__printf_interrupted_entry(trace); 1716 1717 ttrace->entry_time = sample->time; 1718 msg = ttrace->entry_str; 1719 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1720 1721 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1722 args, NULL, 0, trace, thread); 1723 1724 if (sc->is_exit) { 1725 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) { 1726 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output); 1727 fprintf(trace->output, "%-70s)\n", ttrace->entry_str); 1728 } 1729 } else { 1730 ttrace->entry_pending = true; 1731 /* See trace__vfs_getname & trace__sys_exit */ 1732 ttrace->filename.pending_open = false; 1733 } 1734 1735 if (trace->current != thread) { 1736 thread__put(trace->current); 1737 trace->current = thread__get(thread); 1738 } 1739 err = 0; 1740 out_put: 1741 thread__put(thread); 1742 return err; 1743 } 1744 1745 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel, 1746 struct perf_sample *sample) 1747 { 1748 struct thread_trace *ttrace; 1749 struct thread *thread; 1750 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1751 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1752 char msg[1024]; 1753 void *args, *augmented_args = NULL; 1754 int augmented_args_size; 1755 1756 if (sc == NULL) 1757 return -1; 1758 1759 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1760 ttrace = thread__trace(thread, trace->output); 1761 /* 1762 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() 1763 * and the rest of the beautifiers accessing it via struct syscall_arg touches it. 1764 */ 1765 if (ttrace == NULL) 1766 goto out_put; 1767 1768 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1769 augmented_args_size = sample->raw_size - sc->args_size; 1770 if (augmented_args_size > 0) 1771 augmented_args = sample->raw_data + sc->args_size; 1772 1773 syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); 1774 fprintf(trace->output, "%s", msg); 1775 err = 0; 1776 out_put: 1777 thread__put(thread); 1778 return err; 1779 } 1780 1781 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1782 struct perf_sample *sample, 1783 struct callchain_cursor *cursor) 1784 { 1785 struct addr_location al; 1786 int max_stack = evsel->attr.sample_max_stack ? 1787 evsel->attr.sample_max_stack : 1788 trace->max_stack; 1789 1790 if (machine__resolve(trace->host, &al, sample) < 0 || 1791 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack)) 1792 return -1; 1793 1794 return 0; 1795 } 1796 1797 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 1798 { 1799 /* TODO: user-configurable print_opts */ 1800 const unsigned int print_opts = EVSEL__PRINT_SYM | 1801 EVSEL__PRINT_DSO | 1802 EVSEL__PRINT_UNKNOWN_AS_ADDR; 1803 1804 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 1805 } 1806 1807 static const char *errno_to_name(struct perf_evsel *evsel, int err) 1808 { 1809 struct perf_env *env = perf_evsel__env(evsel); 1810 const char *arch_name = perf_env__arch(env); 1811 1812 return arch_syscalls__strerrno(arch_name, err); 1813 } 1814 1815 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 1816 union perf_event *event __maybe_unused, 1817 struct perf_sample *sample) 1818 { 1819 long ret; 1820 u64 duration = 0; 1821 bool duration_calculated = false; 1822 struct thread *thread; 1823 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0; 1824 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1825 struct thread_trace *ttrace; 1826 1827 if (sc == NULL) 1828 return -1; 1829 1830 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1831 ttrace = thread__trace(thread, trace->output); 1832 if (ttrace == NULL) 1833 goto out_put; 1834 1835 trace__fprintf_sample(trace, evsel, sample, thread); 1836 1837 if (trace->summary) 1838 thread__update_stats(ttrace, id, sample); 1839 1840 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 1841 1842 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) { 1843 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 1844 ttrace->filename.pending_open = false; 1845 ++trace->stats.vfs_getname; 1846 } 1847 1848 if (ttrace->entry_time) { 1849 duration = sample->time - ttrace->entry_time; 1850 if (trace__filter_duration(trace, duration)) 1851 goto out; 1852 duration_calculated = true; 1853 } else if (trace->duration_filter) 1854 goto out; 1855 1856 if (sample->callchain) { 1857 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 1858 if (callchain_ret == 0) { 1859 if (callchain_cursor.nr < trace->min_stack) 1860 goto out; 1861 callchain_ret = 1; 1862 } 1863 } 1864 1865 if (trace->summary_only || (ret >= 0 && trace->failure_only)) 1866 goto out; 1867 1868 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output); 1869 1870 if (ttrace->entry_pending) { 1871 fprintf(trace->output, "%-70s", ttrace->entry_str); 1872 } else { 1873 fprintf(trace->output, " ... ["); 1874 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 1875 fprintf(trace->output, "]: %s()", sc->name); 1876 } 1877 1878 if (sc->fmt == NULL) { 1879 if (ret < 0) 1880 goto errno_print; 1881 signed_print: 1882 fprintf(trace->output, ") = %ld", ret); 1883 } else if (ret < 0) { 1884 errno_print: { 1885 char bf[STRERR_BUFSIZE]; 1886 const char *emsg = str_error_r(-ret, bf, sizeof(bf)), 1887 *e = errno_to_name(evsel, -ret); 1888 1889 fprintf(trace->output, ") = -1 %s %s", e, emsg); 1890 } 1891 } else if (ret == 0 && sc->fmt->timeout) 1892 fprintf(trace->output, ") = 0 Timeout"); 1893 else if (ttrace->ret_scnprintf) { 1894 char bf[1024]; 1895 struct syscall_arg arg = { 1896 .val = ret, 1897 .thread = thread, 1898 .trace = trace, 1899 }; 1900 ttrace->ret_scnprintf(bf, sizeof(bf), &arg); 1901 ttrace->ret_scnprintf = NULL; 1902 fprintf(trace->output, ") = %s", bf); 1903 } else if (sc->fmt->hexret) 1904 fprintf(trace->output, ") = %#lx", ret); 1905 else if (sc->fmt->errpid) { 1906 struct thread *child = machine__find_thread(trace->host, ret, ret); 1907 1908 if (child != NULL) { 1909 fprintf(trace->output, ") = %ld", ret); 1910 if (child->comm_set) 1911 fprintf(trace->output, " (%s)", thread__comm_str(child)); 1912 thread__put(child); 1913 } 1914 } else 1915 goto signed_print; 1916 1917 fputc('\n', trace->output); 1918 1919 if (callchain_ret > 0) 1920 trace__fprintf_callchain(trace, sample); 1921 else if (callchain_ret < 0) 1922 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 1923 out: 1924 ttrace->entry_pending = false; 1925 err = 0; 1926 out_put: 1927 thread__put(thread); 1928 return err; 1929 } 1930 1931 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, 1932 union perf_event *event __maybe_unused, 1933 struct perf_sample *sample) 1934 { 1935 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1936 struct thread_trace *ttrace; 1937 size_t filename_len, entry_str_len, to_move; 1938 ssize_t remaining_space; 1939 char *pos; 1940 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); 1941 1942 if (!thread) 1943 goto out; 1944 1945 ttrace = thread__priv(thread); 1946 if (!ttrace) 1947 goto out_put; 1948 1949 filename_len = strlen(filename); 1950 if (filename_len == 0) 1951 goto out_put; 1952 1953 if (ttrace->filename.namelen < filename_len) { 1954 char *f = realloc(ttrace->filename.name, filename_len + 1); 1955 1956 if (f == NULL) 1957 goto out_put; 1958 1959 ttrace->filename.namelen = filename_len; 1960 ttrace->filename.name = f; 1961 } 1962 1963 strcpy(ttrace->filename.name, filename); 1964 ttrace->filename.pending_open = true; 1965 1966 if (!ttrace->filename.ptr) 1967 goto out_put; 1968 1969 entry_str_len = strlen(ttrace->entry_str); 1970 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ 1971 if (remaining_space <= 0) 1972 goto out_put; 1973 1974 if (filename_len > (size_t)remaining_space) { 1975 filename += filename_len - remaining_space; 1976 filename_len = remaining_space; 1977 } 1978 1979 to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */ 1980 pos = ttrace->entry_str + ttrace->filename.entry_str_pos; 1981 memmove(pos + filename_len, pos, to_move); 1982 memcpy(pos, filename, filename_len); 1983 1984 ttrace->filename.ptr = 0; 1985 ttrace->filename.entry_str_pos = 0; 1986 out_put: 1987 thread__put(thread); 1988 out: 1989 return 0; 1990 } 1991 1992 static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel, 1993 union perf_event *event __maybe_unused, 1994 struct perf_sample *sample) 1995 { 1996 u64 runtime = perf_evsel__intval(evsel, sample, "runtime"); 1997 double runtime_ms = (double)runtime / NSEC_PER_MSEC; 1998 struct thread *thread = machine__findnew_thread(trace->host, 1999 sample->pid, 2000 sample->tid); 2001 struct thread_trace *ttrace = thread__trace(thread, trace->output); 2002 2003 if (ttrace == NULL) 2004 goto out_dump; 2005 2006 ttrace->runtime_ms += runtime_ms; 2007 trace->runtime_ms += runtime_ms; 2008 out_put: 2009 thread__put(thread); 2010 return 0; 2011 2012 out_dump: 2013 fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n", 2014 evsel->name, 2015 perf_evsel__strval(evsel, sample, "comm"), 2016 (pid_t)perf_evsel__intval(evsel, sample, "pid"), 2017 runtime, 2018 perf_evsel__intval(evsel, sample, "vruntime")); 2019 goto out_put; 2020 } 2021 2022 static int bpf_output__printer(enum binary_printer_ops op, 2023 unsigned int val, void *extra __maybe_unused, FILE *fp) 2024 { 2025 unsigned char ch = (unsigned char)val; 2026 2027 switch (op) { 2028 case BINARY_PRINT_CHAR_DATA: 2029 return fprintf(fp, "%c", isprint(ch) ? ch : '.'); 2030 case BINARY_PRINT_DATA_BEGIN: 2031 case BINARY_PRINT_LINE_BEGIN: 2032 case BINARY_PRINT_ADDR: 2033 case BINARY_PRINT_NUM_DATA: 2034 case BINARY_PRINT_NUM_PAD: 2035 case BINARY_PRINT_SEP: 2036 case BINARY_PRINT_CHAR_PAD: 2037 case BINARY_PRINT_LINE_END: 2038 case BINARY_PRINT_DATA_END: 2039 default: 2040 break; 2041 } 2042 2043 return 0; 2044 } 2045 2046 static void bpf_output__fprintf(struct trace *trace, 2047 struct perf_sample *sample) 2048 { 2049 binary__fprintf(sample->raw_data, sample->raw_size, 8, 2050 bpf_output__printer, NULL, trace->output); 2051 } 2052 2053 static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel, 2054 union perf_event *event __maybe_unused, 2055 struct perf_sample *sample) 2056 { 2057 int callchain_ret = 0; 2058 2059 if (sample->callchain) { 2060 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 2061 if (callchain_ret == 0) { 2062 if (callchain_cursor.nr < trace->min_stack) 2063 goto out; 2064 callchain_ret = 1; 2065 } 2066 } 2067 2068 trace__printf_interrupted_entry(trace); 2069 trace__fprintf_tstamp(trace, sample->time, trace->output); 2070 2071 if (trace->trace_syscalls) 2072 fprintf(trace->output, "( ): "); 2073 2074 if (evsel == trace->syscalls.events.augmented) { 2075 int id = perf_evsel__sc_tp_uint(evsel, id, sample); 2076 struct syscall *sc = trace__syscall_info(trace, evsel, id); 2077 2078 if (sc) { 2079 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2080 2081 if (thread) { 2082 trace__fprintf_comm_tid(trace, thread, trace->output); 2083 fprintf(trace->output, "%s(", sc->name); 2084 trace__fprintf_sys_enter(trace, evsel, sample); 2085 fputc(')', trace->output); 2086 thread__put(thread); 2087 goto newline; 2088 } 2089 } 2090 2091 /* 2092 * XXX: Not having the associated syscall info or not finding/adding 2093 * the thread should never happen, but if it does... 2094 * fall thru and print it as a bpf_output event. 2095 */ 2096 } 2097 2098 fprintf(trace->output, "%s:", evsel->name); 2099 2100 if (perf_evsel__is_bpf_output(evsel)) { 2101 bpf_output__fprintf(trace, sample); 2102 } else if (evsel->tp_format) { 2103 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || 2104 trace__fprintf_sys_enter(trace, evsel, sample)) { 2105 event_format__fprintf(evsel->tp_format, sample->cpu, 2106 sample->raw_data, sample->raw_size, 2107 trace->output); 2108 } 2109 } 2110 2111 newline: 2112 fprintf(trace->output, "\n"); 2113 2114 if (callchain_ret > 0) 2115 trace__fprintf_callchain(trace, sample); 2116 else if (callchain_ret < 0) 2117 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 2118 out: 2119 return 0; 2120 } 2121 2122 static void print_location(FILE *f, struct perf_sample *sample, 2123 struct addr_location *al, 2124 bool print_dso, bool print_sym) 2125 { 2126 2127 if ((verbose > 0 || print_dso) && al->map) 2128 fprintf(f, "%s@", al->map->dso->long_name); 2129 2130 if ((verbose > 0 || print_sym) && al->sym) 2131 fprintf(f, "%s+0x%" PRIx64, al->sym->name, 2132 al->addr - al->sym->start); 2133 else if (al->map) 2134 fprintf(f, "0x%" PRIx64, al->addr); 2135 else 2136 fprintf(f, "0x%" PRIx64, sample->addr); 2137 } 2138 2139 static int trace__pgfault(struct trace *trace, 2140 struct perf_evsel *evsel, 2141 union perf_event *event __maybe_unused, 2142 struct perf_sample *sample) 2143 { 2144 struct thread *thread; 2145 struct addr_location al; 2146 char map_type = 'd'; 2147 struct thread_trace *ttrace; 2148 int err = -1; 2149 int callchain_ret = 0; 2150 2151 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2152 2153 if (sample->callchain) { 2154 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 2155 if (callchain_ret == 0) { 2156 if (callchain_cursor.nr < trace->min_stack) 2157 goto out_put; 2158 callchain_ret = 1; 2159 } 2160 } 2161 2162 ttrace = thread__trace(thread, trace->output); 2163 if (ttrace == NULL) 2164 goto out_put; 2165 2166 if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ) 2167 ttrace->pfmaj++; 2168 else 2169 ttrace->pfmin++; 2170 2171 if (trace->summary_only) 2172 goto out; 2173 2174 thread__find_symbol(thread, sample->cpumode, sample->ip, &al); 2175 2176 trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output); 2177 2178 fprintf(trace->output, "%sfault [", 2179 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ? 2180 "maj" : "min"); 2181 2182 print_location(trace->output, sample, &al, false, true); 2183 2184 fprintf(trace->output, "] => "); 2185 2186 thread__find_symbol(thread, sample->cpumode, sample->addr, &al); 2187 2188 if (!al.map) { 2189 thread__find_symbol(thread, sample->cpumode, sample->addr, &al); 2190 2191 if (al.map) 2192 map_type = 'x'; 2193 else 2194 map_type = '?'; 2195 } 2196 2197 print_location(trace->output, sample, &al, true, false); 2198 2199 fprintf(trace->output, " (%c%c)\n", map_type, al.level); 2200 2201 if (callchain_ret > 0) 2202 trace__fprintf_callchain(trace, sample); 2203 else if (callchain_ret < 0) 2204 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 2205 out: 2206 err = 0; 2207 out_put: 2208 thread__put(thread); 2209 return err; 2210 } 2211 2212 static void trace__set_base_time(struct trace *trace, 2213 struct perf_evsel *evsel, 2214 struct perf_sample *sample) 2215 { 2216 /* 2217 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust 2218 * and don't use sample->time unconditionally, we may end up having 2219 * some other event in the future without PERF_SAMPLE_TIME for good 2220 * reason, i.e. we may not be interested in its timestamps, just in 2221 * it taking place, picking some piece of information when it 2222 * appears in our event stream (vfs_getname comes to mind). 2223 */ 2224 if (trace->base_time == 0 && !trace->full_time && 2225 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 2226 trace->base_time = sample->time; 2227 } 2228 2229 static int trace__process_sample(struct perf_tool *tool, 2230 union perf_event *event, 2231 struct perf_sample *sample, 2232 struct perf_evsel *evsel, 2233 struct machine *machine __maybe_unused) 2234 { 2235 struct trace *trace = container_of(tool, struct trace, tool); 2236 struct thread *thread; 2237 int err = 0; 2238 2239 tracepoint_handler handler = evsel->handler; 2240 2241 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2242 if (thread && thread__is_filtered(thread)) 2243 goto out; 2244 2245 trace__set_base_time(trace, evsel, sample); 2246 2247 if (handler) { 2248 ++trace->nr_events; 2249 handler(trace, evsel, event, sample); 2250 } 2251 out: 2252 thread__put(thread); 2253 return err; 2254 } 2255 2256 static int trace__record(struct trace *trace, int argc, const char **argv) 2257 { 2258 unsigned int rec_argc, i, j; 2259 const char **rec_argv; 2260 const char * const record_args[] = { 2261 "record", 2262 "-R", 2263 "-m", "1024", 2264 "-c", "1", 2265 }; 2266 2267 const char * const sc_args[] = { "-e", }; 2268 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 2269 const char * const majpf_args[] = { "-e", "major-faults" }; 2270 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 2271 const char * const minpf_args[] = { "-e", "minor-faults" }; 2272 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 2273 2274 /* +1 is for the event string below */ 2275 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 2276 majpf_args_nr + minpf_args_nr + argc; 2277 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 2278 2279 if (rec_argv == NULL) 2280 return -ENOMEM; 2281 2282 j = 0; 2283 for (i = 0; i < ARRAY_SIZE(record_args); i++) 2284 rec_argv[j++] = record_args[i]; 2285 2286 if (trace->trace_syscalls) { 2287 for (i = 0; i < sc_args_nr; i++) 2288 rec_argv[j++] = sc_args[i]; 2289 2290 /* event string may be different for older kernels - e.g., RHEL6 */ 2291 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2292 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2293 else if (is_valid_tracepoint("syscalls:sys_enter")) 2294 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2295 else { 2296 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2297 free(rec_argv); 2298 return -1; 2299 } 2300 } 2301 2302 if (trace->trace_pgfaults & TRACE_PFMAJ) 2303 for (i = 0; i < majpf_args_nr; i++) 2304 rec_argv[j++] = majpf_args[i]; 2305 2306 if (trace->trace_pgfaults & TRACE_PFMIN) 2307 for (i = 0; i < minpf_args_nr; i++) 2308 rec_argv[j++] = minpf_args[i]; 2309 2310 for (i = 0; i < (unsigned int)argc; i++) 2311 rec_argv[j++] = argv[i]; 2312 2313 return cmd_record(j, rec_argv); 2314 } 2315 2316 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2317 2318 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2319 { 2320 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname"); 2321 2322 if (IS_ERR(evsel)) 2323 return false; 2324 2325 if (perf_evsel__field(evsel, "pathname") == NULL) { 2326 perf_evsel__delete(evsel); 2327 return false; 2328 } 2329 2330 evsel->handler = trace__vfs_getname; 2331 perf_evlist__add(evlist, evsel); 2332 return true; 2333 } 2334 2335 static struct perf_evsel *perf_evsel__new_pgfault(u64 config) 2336 { 2337 struct perf_evsel *evsel; 2338 struct perf_event_attr attr = { 2339 .type = PERF_TYPE_SOFTWARE, 2340 .mmap_data = 1, 2341 }; 2342 2343 attr.config = config; 2344 attr.sample_period = 1; 2345 2346 event_attr_init(&attr); 2347 2348 evsel = perf_evsel__new(&attr); 2349 if (evsel) 2350 evsel->handler = trace__pgfault; 2351 2352 return evsel; 2353 } 2354 2355 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample) 2356 { 2357 const u32 type = event->header.type; 2358 struct perf_evsel *evsel; 2359 2360 if (type != PERF_RECORD_SAMPLE) { 2361 trace__process_event(trace, trace->host, event, sample); 2362 return; 2363 } 2364 2365 evsel = perf_evlist__id2evsel(trace->evlist, sample->id); 2366 if (evsel == NULL) { 2367 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id); 2368 return; 2369 } 2370 2371 trace__set_base_time(trace, evsel, sample); 2372 2373 if (evsel->attr.type == PERF_TYPE_TRACEPOINT && 2374 sample->raw_data == NULL) { 2375 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n", 2376 perf_evsel__name(evsel), sample->tid, 2377 sample->cpu, sample->raw_size); 2378 } else { 2379 tracepoint_handler handler = evsel->handler; 2380 handler(trace, evsel, event, sample); 2381 } 2382 } 2383 2384 static int trace__add_syscall_newtp(struct trace *trace) 2385 { 2386 int ret = -1; 2387 struct perf_evlist *evlist = trace->evlist; 2388 struct perf_evsel *sys_enter, *sys_exit; 2389 2390 sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter); 2391 if (sys_enter == NULL) 2392 goto out; 2393 2394 if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args)) 2395 goto out_delete_sys_enter; 2396 2397 sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit); 2398 if (sys_exit == NULL) 2399 goto out_delete_sys_enter; 2400 2401 if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret)) 2402 goto out_delete_sys_exit; 2403 2404 perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param); 2405 perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param); 2406 2407 perf_evlist__add(evlist, sys_enter); 2408 perf_evlist__add(evlist, sys_exit); 2409 2410 if (callchain_param.enabled && !trace->kernel_syscallchains) { 2411 /* 2412 * We're interested only in the user space callchain 2413 * leading to the syscall, allow overriding that for 2414 * debugging reasons using --kernel_syscall_callchains 2415 */ 2416 sys_exit->attr.exclude_callchain_kernel = 1; 2417 } 2418 2419 trace->syscalls.events.sys_enter = sys_enter; 2420 trace->syscalls.events.sys_exit = sys_exit; 2421 2422 ret = 0; 2423 out: 2424 return ret; 2425 2426 out_delete_sys_exit: 2427 perf_evsel__delete_priv(sys_exit); 2428 out_delete_sys_enter: 2429 perf_evsel__delete_priv(sys_enter); 2430 goto out; 2431 } 2432 2433 static int trace__set_ev_qualifier_filter(struct trace *trace) 2434 { 2435 int err = -1; 2436 struct perf_evsel *sys_exit; 2437 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier, 2438 trace->ev_qualifier_ids.nr, 2439 trace->ev_qualifier_ids.entries); 2440 2441 if (filter == NULL) 2442 goto out_enomem; 2443 2444 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter, 2445 filter)) { 2446 sys_exit = trace->syscalls.events.sys_exit; 2447 err = perf_evsel__append_tp_filter(sys_exit, filter); 2448 } 2449 2450 free(filter); 2451 out: 2452 return err; 2453 out_enomem: 2454 errno = ENOMEM; 2455 goto out; 2456 } 2457 2458 static int trace__set_filter_loop_pids(struct trace *trace) 2459 { 2460 unsigned int nr = 1; 2461 pid_t pids[32] = { 2462 getpid(), 2463 }; 2464 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]); 2465 2466 while (thread && nr < ARRAY_SIZE(pids)) { 2467 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid); 2468 2469 if (parent == NULL) 2470 break; 2471 2472 if (!strcmp(thread__comm_str(parent), "sshd")) { 2473 pids[nr++] = parent->tid; 2474 break; 2475 } 2476 thread = parent; 2477 } 2478 2479 return perf_evlist__set_filter_pids(trace->evlist, nr, pids); 2480 } 2481 2482 static int trace__run(struct trace *trace, int argc, const char **argv) 2483 { 2484 struct perf_evlist *evlist = trace->evlist; 2485 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; 2486 int err = -1, i; 2487 unsigned long before; 2488 const bool forks = argc > 0; 2489 bool draining = false; 2490 2491 trace->live = true; 2492 2493 if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) 2494 goto out_error_raw_syscalls; 2495 2496 if (trace->trace_syscalls) 2497 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); 2498 2499 if ((trace->trace_pgfaults & TRACE_PFMAJ)) { 2500 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); 2501 if (pgfault_maj == NULL) 2502 goto out_error_mem; 2503 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); 2504 perf_evlist__add(evlist, pgfault_maj); 2505 } 2506 2507 if ((trace->trace_pgfaults & TRACE_PFMIN)) { 2508 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); 2509 if (pgfault_min == NULL) 2510 goto out_error_mem; 2511 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); 2512 perf_evlist__add(evlist, pgfault_min); 2513 } 2514 2515 if (trace->sched && 2516 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", 2517 trace__sched_stat_runtime)) 2518 goto out_error_sched_stat_runtime; 2519 2520 /* 2521 * If a global cgroup was set, apply it to all the events without an 2522 * explicit cgroup. I.e.: 2523 * 2524 * trace -G A -e sched:*switch 2525 * 2526 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc 2527 * _and_ sched:sched_switch to the 'A' cgroup, while: 2528 * 2529 * trace -e sched:*switch -G A 2530 * 2531 * will only set the sched:sched_switch event to the 'A' cgroup, all the 2532 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without" 2533 * a cgroup (on the root cgroup, sys wide, etc). 2534 * 2535 * Multiple cgroups: 2536 * 2537 * trace -G A -e sched:*switch -G B 2538 * 2539 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes 2540 * to the 'B' cgroup. 2541 * 2542 * evlist__set_default_cgroup() grabs a reference of the passed cgroup 2543 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL. 2544 */ 2545 if (trace->cgroup) 2546 evlist__set_default_cgroup(trace->evlist, trace->cgroup); 2547 2548 err = perf_evlist__create_maps(evlist, &trace->opts.target); 2549 if (err < 0) { 2550 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); 2551 goto out_delete_evlist; 2552 } 2553 2554 err = trace__symbols_init(trace, evlist); 2555 if (err < 0) { 2556 fprintf(trace->output, "Problems initializing symbol libraries!\n"); 2557 goto out_delete_evlist; 2558 } 2559 2560 perf_evlist__config(evlist, &trace->opts, &callchain_param); 2561 2562 signal(SIGCHLD, sig_handler); 2563 signal(SIGINT, sig_handler); 2564 2565 if (forks) { 2566 err = perf_evlist__prepare_workload(evlist, &trace->opts.target, 2567 argv, false, NULL); 2568 if (err < 0) { 2569 fprintf(trace->output, "Couldn't run the workload!\n"); 2570 goto out_delete_evlist; 2571 } 2572 } 2573 2574 err = perf_evlist__open(evlist); 2575 if (err < 0) 2576 goto out_error_open; 2577 2578 err = bpf__apply_obj_config(); 2579 if (err) { 2580 char errbuf[BUFSIZ]; 2581 2582 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); 2583 pr_err("ERROR: Apply config to BPF failed: %s\n", 2584 errbuf); 2585 goto out_error_open; 2586 } 2587 2588 /* 2589 * Better not use !target__has_task() here because we need to cover the 2590 * case where no threads were specified in the command line, but a 2591 * workload was, and in that case we will fill in the thread_map when 2592 * we fork the workload in perf_evlist__prepare_workload. 2593 */ 2594 if (trace->filter_pids.nr > 0) 2595 err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries); 2596 else if (thread_map__pid(evlist->threads, 0) == -1) 2597 err = trace__set_filter_loop_pids(trace); 2598 2599 if (err < 0) 2600 goto out_error_mem; 2601 2602 if (trace->ev_qualifier_ids.nr > 0) { 2603 err = trace__set_ev_qualifier_filter(trace); 2604 if (err < 0) 2605 goto out_errno; 2606 2607 pr_debug("event qualifier tracepoint filter: %s\n", 2608 trace->syscalls.events.sys_exit->filter); 2609 } 2610 2611 err = perf_evlist__apply_filters(evlist, &evsel); 2612 if (err < 0) 2613 goto out_error_apply_filters; 2614 2615 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages); 2616 if (err < 0) 2617 goto out_error_mmap; 2618 2619 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay) 2620 perf_evlist__enable(evlist); 2621 2622 if (forks) 2623 perf_evlist__start_workload(evlist); 2624 2625 if (trace->opts.initial_delay) { 2626 usleep(trace->opts.initial_delay * 1000); 2627 perf_evlist__enable(evlist); 2628 } 2629 2630 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 || 2631 evlist->threads->nr > 1 || 2632 perf_evlist__first(evlist)->attr.inherit; 2633 2634 /* 2635 * Now that we already used evsel->attr to ask the kernel to setup the 2636 * events, lets reuse evsel->attr.sample_max_stack as the limit in 2637 * trace__resolve_callchain(), allowing per-event max-stack settings 2638 * to override an explicitely set --max-stack global setting. 2639 */ 2640 evlist__for_each_entry(evlist, evsel) { 2641 if (evsel__has_callchain(evsel) && 2642 evsel->attr.sample_max_stack == 0) 2643 evsel->attr.sample_max_stack = trace->max_stack; 2644 } 2645 again: 2646 before = trace->nr_events; 2647 2648 for (i = 0; i < evlist->nr_mmaps; i++) { 2649 union perf_event *event; 2650 struct perf_mmap *md; 2651 2652 md = &evlist->mmap[i]; 2653 if (perf_mmap__read_init(md) < 0) 2654 continue; 2655 2656 while ((event = perf_mmap__read_event(md)) != NULL) { 2657 struct perf_sample sample; 2658 2659 ++trace->nr_events; 2660 2661 err = perf_evlist__parse_sample(evlist, event, &sample); 2662 if (err) { 2663 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); 2664 goto next_event; 2665 } 2666 2667 trace__handle_event(trace, event, &sample); 2668 next_event: 2669 perf_mmap__consume(md); 2670 2671 if (interrupted) 2672 goto out_disable; 2673 2674 if (done && !draining) { 2675 perf_evlist__disable(evlist); 2676 draining = true; 2677 } 2678 } 2679 perf_mmap__read_done(md); 2680 } 2681 2682 if (trace->nr_events == before) { 2683 int timeout = done ? 100 : -1; 2684 2685 if (!draining && perf_evlist__poll(evlist, timeout) > 0) { 2686 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0) 2687 draining = true; 2688 2689 goto again; 2690 } 2691 } else { 2692 goto again; 2693 } 2694 2695 out_disable: 2696 thread__zput(trace->current); 2697 2698 perf_evlist__disable(evlist); 2699 2700 if (!err) { 2701 if (trace->summary) 2702 trace__fprintf_thread_summary(trace, trace->output); 2703 2704 if (trace->show_tool_stats) { 2705 fprintf(trace->output, "Stats:\n " 2706 " vfs_getname : %" PRIu64 "\n" 2707 " proc_getname: %" PRIu64 "\n", 2708 trace->stats.vfs_getname, 2709 trace->stats.proc_getname); 2710 } 2711 } 2712 2713 out_delete_evlist: 2714 trace__symbols__exit(trace); 2715 2716 perf_evlist__delete(evlist); 2717 cgroup__put(trace->cgroup); 2718 trace->evlist = NULL; 2719 trace->live = false; 2720 return err; 2721 { 2722 char errbuf[BUFSIZ]; 2723 2724 out_error_sched_stat_runtime: 2725 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime"); 2726 goto out_error; 2727 2728 out_error_raw_syscalls: 2729 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)"); 2730 goto out_error; 2731 2732 out_error_mmap: 2733 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf)); 2734 goto out_error; 2735 2736 out_error_open: 2737 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); 2738 2739 out_error: 2740 fprintf(trace->output, "%s\n", errbuf); 2741 goto out_delete_evlist; 2742 2743 out_error_apply_filters: 2744 fprintf(trace->output, 2745 "Failed to set filter \"%s\" on event %s with %d (%s)\n", 2746 evsel->filter, perf_evsel__name(evsel), errno, 2747 str_error_r(errno, errbuf, sizeof(errbuf))); 2748 goto out_delete_evlist; 2749 } 2750 out_error_mem: 2751 fprintf(trace->output, "Not enough memory to run!\n"); 2752 goto out_delete_evlist; 2753 2754 out_errno: 2755 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno)); 2756 goto out_delete_evlist; 2757 } 2758 2759 static int trace__replay(struct trace *trace) 2760 { 2761 const struct perf_evsel_str_handler handlers[] = { 2762 { "probe:vfs_getname", trace__vfs_getname, }, 2763 }; 2764 struct perf_data data = { 2765 .file = { 2766 .path = input_name, 2767 }, 2768 .mode = PERF_DATA_MODE_READ, 2769 .force = trace->force, 2770 }; 2771 struct perf_session *session; 2772 struct perf_evsel *evsel; 2773 int err = -1; 2774 2775 trace->tool.sample = trace__process_sample; 2776 trace->tool.mmap = perf_event__process_mmap; 2777 trace->tool.mmap2 = perf_event__process_mmap2; 2778 trace->tool.comm = perf_event__process_comm; 2779 trace->tool.exit = perf_event__process_exit; 2780 trace->tool.fork = perf_event__process_fork; 2781 trace->tool.attr = perf_event__process_attr; 2782 trace->tool.tracing_data = perf_event__process_tracing_data; 2783 trace->tool.build_id = perf_event__process_build_id; 2784 trace->tool.namespaces = perf_event__process_namespaces; 2785 2786 trace->tool.ordered_events = true; 2787 trace->tool.ordering_requires_timestamps = true; 2788 2789 /* add tid to output */ 2790 trace->multiple_threads = true; 2791 2792 session = perf_session__new(&data, false, &trace->tool); 2793 if (session == NULL) 2794 return -1; 2795 2796 if (trace->opts.target.pid) 2797 symbol_conf.pid_list_str = strdup(trace->opts.target.pid); 2798 2799 if (trace->opts.target.tid) 2800 symbol_conf.tid_list_str = strdup(trace->opts.target.tid); 2801 2802 if (symbol__init(&session->header.env) < 0) 2803 goto out; 2804 2805 trace->host = &session->machines.host; 2806 2807 err = perf_session__set_tracepoints_handlers(session, handlers); 2808 if (err) 2809 goto out; 2810 2811 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2812 "raw_syscalls:sys_enter"); 2813 /* older kernels have syscalls tp versus raw_syscalls */ 2814 if (evsel == NULL) 2815 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2816 "syscalls:sys_enter"); 2817 2818 if (evsel && 2819 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 || 2820 perf_evsel__init_sc_tp_ptr_field(evsel, args))) { 2821 pr_err("Error during initialize raw_syscalls:sys_enter event\n"); 2822 goto out; 2823 } 2824 2825 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2826 "raw_syscalls:sys_exit"); 2827 if (evsel == NULL) 2828 evsel = perf_evlist__find_tracepoint_by_name(session->evlist, 2829 "syscalls:sys_exit"); 2830 if (evsel && 2831 (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 || 2832 perf_evsel__init_sc_tp_uint_field(evsel, ret))) { 2833 pr_err("Error during initialize raw_syscalls:sys_exit event\n"); 2834 goto out; 2835 } 2836 2837 evlist__for_each_entry(session->evlist, evsel) { 2838 if (evsel->attr.type == PERF_TYPE_SOFTWARE && 2839 (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ || 2840 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN || 2841 evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS)) 2842 evsel->handler = trace__pgfault; 2843 } 2844 2845 setup_pager(); 2846 2847 err = perf_session__process_events(session); 2848 if (err) 2849 pr_err("Failed to process events, error %d", err); 2850 2851 else if (trace->summary) 2852 trace__fprintf_thread_summary(trace, trace->output); 2853 2854 out: 2855 perf_session__delete(session); 2856 2857 return err; 2858 } 2859 2860 static size_t trace__fprintf_threads_header(FILE *fp) 2861 { 2862 size_t printed; 2863 2864 printed = fprintf(fp, "\n Summary of events:\n\n"); 2865 2866 return printed; 2867 } 2868 2869 DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs, 2870 struct stats *stats; 2871 double msecs; 2872 int syscall; 2873 ) 2874 { 2875 struct int_node *source = rb_entry(nd, struct int_node, rb_node); 2876 struct stats *stats = source->priv; 2877 2878 entry->syscall = source->i; 2879 entry->stats = stats; 2880 entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0; 2881 } 2882 2883 static size_t thread__dump_stats(struct thread_trace *ttrace, 2884 struct trace *trace, FILE *fp) 2885 { 2886 size_t printed = 0; 2887 struct syscall *sc; 2888 struct rb_node *nd; 2889 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); 2890 2891 if (syscall_stats == NULL) 2892 return 0; 2893 2894 printed += fprintf(fp, "\n"); 2895 2896 printed += fprintf(fp, " syscall calls total min avg max stddev\n"); 2897 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 2898 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); 2899 2900 resort_rb__for_each_entry(nd, syscall_stats) { 2901 struct stats *stats = syscall_stats_entry->stats; 2902 if (stats) { 2903 double min = (double)(stats->min) / NSEC_PER_MSEC; 2904 double max = (double)(stats->max) / NSEC_PER_MSEC; 2905 double avg = avg_stats(stats); 2906 double pct; 2907 u64 n = (u64) stats->n; 2908 2909 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0; 2910 avg /= NSEC_PER_MSEC; 2911 2912 sc = &trace->syscalls.table[syscall_stats_entry->syscall]; 2913 printed += fprintf(fp, " %-15s", sc->name); 2914 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", 2915 n, syscall_stats_entry->msecs, min, avg); 2916 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); 2917 } 2918 } 2919 2920 resort_rb__delete(syscall_stats); 2921 printed += fprintf(fp, "\n\n"); 2922 2923 return printed; 2924 } 2925 2926 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) 2927 { 2928 size_t printed = 0; 2929 struct thread_trace *ttrace = thread__priv(thread); 2930 double ratio; 2931 2932 if (ttrace == NULL) 2933 return 0; 2934 2935 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; 2936 2937 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); 2938 printed += fprintf(fp, "%lu events, ", ttrace->nr_events); 2939 printed += fprintf(fp, "%.1f%%", ratio); 2940 if (ttrace->pfmaj) 2941 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj); 2942 if (ttrace->pfmin) 2943 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin); 2944 if (trace->sched) 2945 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); 2946 else if (fputc('\n', fp) != EOF) 2947 ++printed; 2948 2949 printed += thread__dump_stats(ttrace, trace, fp); 2950 2951 return printed; 2952 } 2953 2954 static unsigned long thread__nr_events(struct thread_trace *ttrace) 2955 { 2956 return ttrace ? ttrace->nr_events : 0; 2957 } 2958 2959 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), 2960 struct thread *thread; 2961 ) 2962 { 2963 entry->thread = rb_entry(nd, struct thread, rb_node); 2964 } 2965 2966 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) 2967 { 2968 size_t printed = trace__fprintf_threads_header(fp); 2969 struct rb_node *nd; 2970 int i; 2971 2972 for (i = 0; i < THREADS__TABLE_SIZE; i++) { 2973 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i); 2974 2975 if (threads == NULL) { 2976 fprintf(fp, "%s", "Error sorting output by nr_events!\n"); 2977 return 0; 2978 } 2979 2980 resort_rb__for_each_entry(nd, threads) 2981 printed += trace__fprintf_thread(fp, threads_entry->thread, trace); 2982 2983 resort_rb__delete(threads); 2984 } 2985 return printed; 2986 } 2987 2988 static int trace__set_duration(const struct option *opt, const char *str, 2989 int unset __maybe_unused) 2990 { 2991 struct trace *trace = opt->value; 2992 2993 trace->duration_filter = atof(str); 2994 return 0; 2995 } 2996 2997 static int trace__set_filter_pids(const struct option *opt, const char *str, 2998 int unset __maybe_unused) 2999 { 3000 int ret = -1; 3001 size_t i; 3002 struct trace *trace = opt->value; 3003 /* 3004 * FIXME: introduce a intarray class, plain parse csv and create a 3005 * { int nr, int entries[] } struct... 3006 */ 3007 struct intlist *list = intlist__new(str); 3008 3009 if (list == NULL) 3010 return -1; 3011 3012 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1; 3013 trace->filter_pids.entries = calloc(i, sizeof(pid_t)); 3014 3015 if (trace->filter_pids.entries == NULL) 3016 goto out; 3017 3018 trace->filter_pids.entries[0] = getpid(); 3019 3020 for (i = 1; i < trace->filter_pids.nr; ++i) 3021 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i; 3022 3023 intlist__delete(list); 3024 ret = 0; 3025 out: 3026 return ret; 3027 } 3028 3029 static int trace__open_output(struct trace *trace, const char *filename) 3030 { 3031 struct stat st; 3032 3033 if (!stat(filename, &st) && st.st_size) { 3034 char oldname[PATH_MAX]; 3035 3036 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 3037 unlink(oldname); 3038 rename(filename, oldname); 3039 } 3040 3041 trace->output = fopen(filename, "w"); 3042 3043 return trace->output == NULL ? -errno : 0; 3044 } 3045 3046 static int parse_pagefaults(const struct option *opt, const char *str, 3047 int unset __maybe_unused) 3048 { 3049 int *trace_pgfaults = opt->value; 3050 3051 if (strcmp(str, "all") == 0) 3052 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN; 3053 else if (strcmp(str, "maj") == 0) 3054 *trace_pgfaults |= TRACE_PFMAJ; 3055 else if (strcmp(str, "min") == 0) 3056 *trace_pgfaults |= TRACE_PFMIN; 3057 else 3058 return -1; 3059 3060 return 0; 3061 } 3062 3063 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler) 3064 { 3065 struct perf_evsel *evsel; 3066 3067 evlist__for_each_entry(evlist, evsel) 3068 evsel->handler = handler; 3069 } 3070 3071 static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist) 3072 { 3073 struct perf_evsel *evsel; 3074 3075 evlist__for_each_entry(evlist, evsel) { 3076 if (evsel->priv || !evsel->tp_format) 3077 continue; 3078 3079 if (strcmp(evsel->tp_format->system, "syscalls")) 3080 continue; 3081 3082 if (perf_evsel__init_syscall_tp(evsel)) 3083 return -1; 3084 3085 if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) { 3086 struct syscall_tp *sc = evsel->priv; 3087 3088 if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64))) 3089 return -1; 3090 } else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) { 3091 struct syscall_tp *sc = evsel->priv; 3092 3093 if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap)) 3094 return -1; 3095 } 3096 } 3097 3098 return 0; 3099 } 3100 3101 /* 3102 * XXX: Hackish, just splitting the combined -e+--event (syscalls 3103 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use 3104 * existing facilities unchanged (trace->ev_qualifier + parse_options()). 3105 * 3106 * It'd be better to introduce a parse_options() variant that would return a 3107 * list with the terms it didn't match to an event... 3108 */ 3109 static int trace__parse_events_option(const struct option *opt, const char *str, 3110 int unset __maybe_unused) 3111 { 3112 struct trace *trace = (struct trace *)opt->value; 3113 const char *s = str; 3114 char *sep = NULL, *lists[2] = { NULL, NULL, }; 3115 int len = strlen(str) + 1, err = -1, list, idx; 3116 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); 3117 char group_name[PATH_MAX]; 3118 3119 if (strace_groups_dir == NULL) 3120 return -1; 3121 3122 if (*s == '!') { 3123 ++s; 3124 trace->not_ev_qualifier = true; 3125 } 3126 3127 while (1) { 3128 if ((sep = strchr(s, ',')) != NULL) 3129 *sep = '\0'; 3130 3131 list = 0; 3132 if (syscalltbl__id(trace->sctbl, s) >= 0 || 3133 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { 3134 list = 1; 3135 } else { 3136 path__join(group_name, sizeof(group_name), strace_groups_dir, s); 3137 if (access(group_name, R_OK) == 0) 3138 list = 1; 3139 } 3140 3141 if (lists[list]) { 3142 sprintf(lists[list] + strlen(lists[list]), ",%s", s); 3143 } else { 3144 lists[list] = malloc(len); 3145 if (lists[list] == NULL) 3146 goto out; 3147 strcpy(lists[list], s); 3148 } 3149 3150 if (!sep) 3151 break; 3152 3153 *sep = ','; 3154 s = sep + 1; 3155 } 3156 3157 if (lists[1] != NULL) { 3158 struct strlist_config slist_config = { 3159 .dirname = strace_groups_dir, 3160 }; 3161 3162 trace->ev_qualifier = strlist__new(lists[1], &slist_config); 3163 if (trace->ev_qualifier == NULL) { 3164 fputs("Not enough memory to parse event qualifier", trace->output); 3165 goto out; 3166 } 3167 3168 if (trace__validate_ev_qualifier(trace)) 3169 goto out; 3170 trace->trace_syscalls = true; 3171 } 3172 3173 err = 0; 3174 3175 if (lists[0]) { 3176 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", 3177 "event selector. use 'perf list' to list available events", 3178 parse_events_option); 3179 err = parse_events_option(&o, lists[0], 0); 3180 } 3181 out: 3182 if (sep) 3183 *sep = ','; 3184 3185 return err; 3186 } 3187 3188 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset) 3189 { 3190 struct trace *trace = opt->value; 3191 3192 if (!list_empty(&trace->evlist->entries)) 3193 return parse_cgroups(opt, str, unset); 3194 3195 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str); 3196 3197 return 0; 3198 } 3199 3200 int cmd_trace(int argc, const char **argv) 3201 { 3202 const char *trace_usage[] = { 3203 "perf trace [<options>] [<command>]", 3204 "perf trace [<options>] -- <command> [<options>]", 3205 "perf trace record [<options>] [<command>]", 3206 "perf trace record [<options>] -- <command> [<options>]", 3207 NULL 3208 }; 3209 struct trace trace = { 3210 .syscalls = { 3211 . max = -1, 3212 }, 3213 .opts = { 3214 .target = { 3215 .uid = UINT_MAX, 3216 .uses_mmap = true, 3217 }, 3218 .user_freq = UINT_MAX, 3219 .user_interval = ULLONG_MAX, 3220 .no_buffering = true, 3221 .mmap_pages = UINT_MAX, 3222 .proc_map_timeout = 500, 3223 }, 3224 .output = stderr, 3225 .show_comm = true, 3226 .trace_syscalls = false, 3227 .kernel_syscallchains = false, 3228 .max_stack = UINT_MAX, 3229 }; 3230 const char *output_name = NULL; 3231 const struct option trace_options[] = { 3232 OPT_CALLBACK('e', "event", &trace, "event", 3233 "event/syscall selector. use 'perf list' to list available events", 3234 trace__parse_events_option), 3235 OPT_BOOLEAN(0, "comm", &trace.show_comm, 3236 "show the thread COMM next to its id"), 3237 OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"), 3238 OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace", 3239 trace__parse_events_option), 3240 OPT_STRING('o', "output", &output_name, "file", "output file name"), 3241 OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"), 3242 OPT_STRING('p', "pid", &trace.opts.target.pid, "pid", 3243 "trace events on existing process id"), 3244 OPT_STRING('t', "tid", &trace.opts.target.tid, "tid", 3245 "trace events on existing thread id"), 3246 OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids", 3247 "pids to filter (by the kernel)", trace__set_filter_pids), 3248 OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide, 3249 "system-wide collection from all CPUs"), 3250 OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu", 3251 "list of cpus to monitor"), 3252 OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit, 3253 "child tasks do not inherit counters"), 3254 OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages", 3255 "number of mmap data pages", 3256 perf_evlist__parse_mmap_pages), 3257 OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user", 3258 "user to profile"), 3259 OPT_CALLBACK(0, "duration", &trace, "float", 3260 "show only events with duration > N.M ms", 3261 trace__set_duration), 3262 OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"), 3263 OPT_INCR('v', "verbose", &verbose, "be more verbose"), 3264 OPT_BOOLEAN('T', "time", &trace.full_time, 3265 "Show full timestamp, not time relative to first start"), 3266 OPT_BOOLEAN(0, "failure", &trace.failure_only, 3267 "Show only syscalls that failed"), 3268 OPT_BOOLEAN('s', "summary", &trace.summary_only, 3269 "Show only syscall summary with statistics"), 3270 OPT_BOOLEAN('S', "with-summary", &trace.summary, 3271 "Show all syscalls and summary with statistics"), 3272 OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min", 3273 "Trace pagefaults", parse_pagefaults, "maj"), 3274 OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"), 3275 OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"), 3276 OPT_CALLBACK(0, "call-graph", &trace.opts, 3277 "record_mode[,record_size]", record_callchain_help, 3278 &record_parse_callchain_opt), 3279 OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains, 3280 "Show the kernel callchains on the syscall exit path"), 3281 OPT_UINTEGER(0, "min-stack", &trace.min_stack, 3282 "Set the minimum stack depth when parsing the callchain, " 3283 "anything below the specified depth will be ignored."), 3284 OPT_UINTEGER(0, "max-stack", &trace.max_stack, 3285 "Set the maximum stack depth when parsing the callchain, " 3286 "anything beyond the specified depth will be ignored. " 3287 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), 3288 OPT_BOOLEAN(0, "print-sample", &trace.print_sample, 3289 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"), 3290 OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout, 3291 "per thread proc mmap processing timeout in ms"), 3292 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only", 3293 trace__parse_cgroups), 3294 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay, 3295 "ms to wait before starting measurement after program " 3296 "start"), 3297 OPT_END() 3298 }; 3299 bool __maybe_unused max_stack_user_set = true; 3300 bool mmap_pages_user_set = true; 3301 struct perf_evsel *evsel; 3302 const char * const trace_subcommands[] = { "record", NULL }; 3303 int err = -1; 3304 char bf[BUFSIZ]; 3305 3306 signal(SIGSEGV, sighandler_dump_stack); 3307 signal(SIGFPE, sighandler_dump_stack); 3308 3309 trace.evlist = perf_evlist__new(); 3310 trace.sctbl = syscalltbl__new(); 3311 3312 if (trace.evlist == NULL || trace.sctbl == NULL) { 3313 pr_err("Not enough memory to run!\n"); 3314 err = -ENOMEM; 3315 goto out; 3316 } 3317 3318 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, 3319 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); 3320 3321 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) { 3322 usage_with_options_msg(trace_usage, trace_options, 3323 "cgroup monitoring only available in system-wide mode"); 3324 } 3325 3326 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); 3327 if (IS_ERR(evsel)) { 3328 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); 3329 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); 3330 goto out; 3331 } 3332 3333 if (evsel) { 3334 if (perf_evsel__init_augmented_syscall_tp(evsel) || 3335 perf_evsel__init_augmented_syscall_tp_args(evsel)) 3336 goto out; 3337 trace.syscalls.events.augmented = evsel; 3338 } 3339 3340 err = bpf__setup_stdout(trace.evlist); 3341 if (err) { 3342 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); 3343 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); 3344 goto out; 3345 } 3346 3347 err = -1; 3348 3349 if (trace.trace_pgfaults) { 3350 trace.opts.sample_address = true; 3351 trace.opts.sample_time = true; 3352 } 3353 3354 if (trace.opts.mmap_pages == UINT_MAX) 3355 mmap_pages_user_set = false; 3356 3357 if (trace.max_stack == UINT_MAX) { 3358 trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl__max_stack(); 3359 max_stack_user_set = false; 3360 } 3361 3362 #ifdef HAVE_DWARF_UNWIND_SUPPORT 3363 if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) { 3364 record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false); 3365 } 3366 #endif 3367 3368 if (callchain_param.enabled) { 3369 if (!mmap_pages_user_set && geteuid() == 0) 3370 trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4; 3371 3372 symbol_conf.use_callchain = true; 3373 } 3374 3375 if (trace.evlist->nr_entries > 0) { 3376 evlist__set_evsel_handler(trace.evlist, trace__event_handler); 3377 if (evlist__set_syscall_tp_fields(trace.evlist)) { 3378 perror("failed to set syscalls:* tracepoint fields"); 3379 goto out; 3380 } 3381 } 3382 3383 if ((argc >= 1) && (strcmp(argv[0], "record") == 0)) 3384 return trace__record(&trace, argc-1, &argv[1]); 3385 3386 /* summary_only implies summary option, but don't overwrite summary if set */ 3387 if (trace.summary_only) 3388 trace.summary = trace.summary_only; 3389 3390 if (!trace.trace_syscalls && !trace.trace_pgfaults && 3391 trace.evlist->nr_entries == 0 /* Was --events used? */) { 3392 trace.trace_syscalls = true; 3393 } 3394 3395 if (output_name != NULL) { 3396 err = trace__open_output(&trace, output_name); 3397 if (err < 0) { 3398 perror("failed to create output file"); 3399 goto out; 3400 } 3401 } 3402 3403 err = target__validate(&trace.opts.target); 3404 if (err) { 3405 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 3406 fprintf(trace.output, "%s", bf); 3407 goto out_close; 3408 } 3409 3410 err = target__parse_uid(&trace.opts.target); 3411 if (err) { 3412 target__strerror(&trace.opts.target, err, bf, sizeof(bf)); 3413 fprintf(trace.output, "%s", bf); 3414 goto out_close; 3415 } 3416 3417 if (!argc && target__none(&trace.opts.target)) 3418 trace.opts.target.system_wide = true; 3419 3420 if (input_name) 3421 err = trace__replay(&trace); 3422 else 3423 err = trace__run(&trace, argc, argv); 3424 3425 out_close: 3426 if (output_name != NULL) 3427 fclose(trace.output); 3428 out: 3429 return err; 3430 } 3431