// SPDX-License-Identifier: GPL-2.0-only
/*
 * builtin-trace.c
 *
 * Builtin 'trace' command:
 *
 * Display a continuously updated trace of any workload, CPU, specific PID,
 * system wide, etc.  Default format is loosely strace like, but any other
 * event may be specified using --event.
 *
 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
 *
 * Initially based on the 'trace' prototype by Thomas Gleixner:
 *
 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
 */

#include <traceevent/event-parse.h>
#include <api/fs/tracing_path.h>
#include <bpf/bpf.h>
#include "util/bpf_map.h"
#include "builtin.h"
#include "util/cgroup.h"
#include "util/color.h"
#include "util/config.h"
#include "util/debug.h"
#include "util/env.h"
#include "util/event.h"
#include "util/evlist.h"
#include <subcmd/exec-cmd.h>
#include "util/machine.h"
#include "util/map.h"
#include "util/symbol.h"
#include "util/path.h"
#include "util/session.h"
#include "util/thread.h"
#include <subcmd/parse-options.h>
#include "util/strlist.h"
#include "util/intlist.h"
#include "util/thread_map.h"
#include "util/stat.h"
#include "trace/beauty/beauty.h"
#include "trace-event.h"
#include "util/parse-events.h"
#include "util/bpf-loader.h"
#include "callchain.h"
#include "print_binary.h"
#include "string2.h"
#include "syscalltbl.h"
#include "rb_resort.h"

#include <errno.h>
#include <inttypes.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <linux/err.h>
#include <linux/filter.h>
#include <linux/kernel.h>
#include <linux/random.h>
#include <linux/stringify.h>
#include <linux/time64.h>
#include <fcntl.h>
#include <sys/sysmacros.h>

#include <linux/ctype.h>

#ifndef O_CLOEXEC
# define O_CLOEXEC		02000000
#endif

#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE	1024
#endif

struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;
		struct syscall	*table;
		struct bpf_map	*map;
		struct {
			struct perf_evsel *sys_enter,
					  *sys_exit,
					  *augmented;
		}		events;
	} syscalls;
	struct {
		struct bpf_map *map;
	} dump;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;
	struct cgroup		*cgroup;
	u64			base_time;
	FILE			*output;
	unsigned long		nr_events;
	unsigned long		nr_events_printed;
	unsigned long		max_events;
	struct strlist		*ev_qualifier;
	struct {
		size_t		nr;
		int		*entries;
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;
		struct bpf_map	*map;
	}			filter_pids;
	double			duration_filter;
	double			runtime_ms;
	struct {
		u64		vfs_getname,
				proc_getname;
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	int			raw_augmented_syscalls_args_size;
	bool			raw_augmented_syscalls;
	bool			sort_events;
	bool			not_ev_qualifier;
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			failure_only;
	bool			show_comm;
	bool			print_sample;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	s16			args_alignment;
	bool			show_tstamp;
	bool			show_duration;
	bool			show_zeros;
	bool			show_arg_names;
	bool			show_string_prefix;
	bool			force;
	bool			vfs_getname;
	int			trace_pgfaults;
	struct {
		struct ordered_events	data;
		u64			last;
	} oe;
};

struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
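/*
 * For reference, TP_UINT_FIELD(32) above expands to roughly the following
 * (verbatim from the macro, modulo whitespace):
 *
 *	static u64 tp_field__u32(struct tp_field *field, struct perf_sample *sample)
 *	{
 *		u32 value;
 *		memcpy(&value, sample->raw_data + field->offset, sizeof(value));
 *		return value;
 *	}
 *
 * The memcpy() is what makes reads from the raw tracepoint payload safe on
 * architectures that dislike unaligned loads.
 */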
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);

static int __tp_field__init_uint(struct tp_field *field, int size, int offset, bool needs_swap)
{
	field->offset = offset;

	switch (size) {
	case 1:
		field->integer = tp_field__u8;
		break;
	case 2:
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
		break;
	case 4:
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
		break;
	case 8:
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
		break;
	default:
		return -1;
	}

	return 0;
}

static int tp_field__init_uint(struct tp_field *field, struct tep_format_field *format_field, bool needs_swap)
{
	return __tp_field__init_uint(field, format_field->size, format_field->offset, needs_swap);
}

static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

static int __tp_field__init_ptr(struct tp_field *field, int offset)
{
	field->offset = offset;
	field->pointer = tp_field__ptr;
	return 0;
}

static int tp_field__init_ptr(struct tp_field *field, struct tep_format_field *format_field)
{
	return __tp_field__init_ptr(field, format_field->offset);
}

struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};

static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct tep_format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct tep_format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })

static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
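/*
 * A sketch of the sys_enter payload these tp_field offsets walk, per the
 * tracefs format file (the offsets come from there, nothing is hardcoded):
 *
 *	| common fields | id (long) | args[0..5] (longs) |
 *
 * tp_field__init_uint()/tp_field__init_ptr() only record the offset and pick
 * an accessor of the right width and byte order.
 */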
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		if (perf_evsel__init_tp_uint_field(evsel, &sc->id, "__syscall_nr") &&
		    perf_evsel__init_tp_uint_field(evsel, &sc->id, "nr"))
			goto out_delete;
		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static int perf_evsel__init_augmented_syscall_tp(struct perf_evsel *evsel, struct perf_evsel *tp)
{
	struct syscall_tp *sc = evsel->priv = malloc(sizeof(struct syscall_tp));

	if (evsel->priv != NULL) {
		struct tep_format_field *syscall_id = perf_evsel__field(tp, "id");
		if (syscall_id == NULL)
			syscall_id = perf_evsel__field(tp, "__syscall_nr");
		if (syscall_id == NULL)
			goto out_delete;
		if (__tp_field__init_uint(&sc->id, syscall_id->size, syscall_id->offset, evsel->needs_swap))
			goto out_delete;

		return 0;
	}

	return -ENOMEM;
out_delete:
	zfree(&evsel->priv);
	return -EINVAL;
}

static int perf_evsel__init_augmented_syscall_tp_args(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64));
}

static int perf_evsel__init_augmented_syscall_tp_ret(struct perf_evsel *evsel)
{
	struct syscall_tp *sc = evsel->priv;

	return __tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap);
}

static int perf_evsel__init_raw_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}

static struct perf_evsel *perf_evsel__raw_syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_raw_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })

size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
	int idx = val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries || sa->entries[idx] == NULL) {
		size_t printed = scnprintf(bf, size, intfmt, val);
		if (show_prefix)
			printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sa->prefix);
		return printed;
	}

	return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
}
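/*
 * Example (sketch): with strarray__itimers defined below (entries "REAL",
 * "VIRTUAL", "PROF", prefix "ITIMER_") and prefixes enabled, val=0 prints
 * "ITIMER_REAL", while an out-of-range val=7 falls back to intfmt and
 * appends the prefix with "???" as a hint.
 */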
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
						struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->show_string_prefix, arg->val);
}

static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray

size_t syscall_arg__scnprintf_strarray_flags(char *bf, size_t size, struct syscall_arg *arg)
{
	return strarray__scnprintf_flags(arg->parm, bf, size, arg->show_string_prefix, arg->val);
}

size_t strarrays__scnprintf(struct strarrays *sas, char *bf, size_t size, const char *intfmt, bool show_prefix, int val)
{
	size_t printed;
	int i;

	for (i = 0; i < sas->nr_entries; ++i) {
		struct strarray *sa = sas->entries[i];
		int idx = val - sa->offset;

		if (idx >= 0 && idx < sa->nr_entries) {
			if (sa->entries[idx] == NULL)
				break;
			return scnprintf(bf, size, "%s%s", show_prefix ? sa->prefix : "", sa->entries[idx]);
		}
	}

	printed = scnprintf(bf, size, intfmt, val);
	if (show_prefix)
		printed += scnprintf(bf + printed, size - printed, " /* %s??? */", sas->entries[0]->prefix);
	return printed;
}

size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
					struct syscall_arg *arg)
{
	return strarrays__scnprintf(arg->parm, bf, size, "%d", arg->show_string_prefix, arg->val);
}

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;
	const char *prefix = "AT_FD";

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "%s%s", arg->show_string_prefix ? prefix : "", "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at
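/*
 * e.g. openat(AT_FDCWD, ...) prints its dirfd argument as "CWD" (or
 * "AT_FDCWD" when prefixes are shown) instead of the raw -100.
 */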
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

size_t syscall_arg__scnprintf_ptr(char *bf, size_t size, struct syscall_arg *arg)
{
	if (arg->val == 0)
		return scnprintf(bf, size, "NULL");
	return syscall_arg__scnprintf_hex(bf, size, arg);
}

size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}

static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd, "BPF_");

static const char *fsmount_flags[] = {
	[1] = "CLOEXEC",
};
static DEFINE_STRARRAY(fsmount_flags, "FSMOUNT_");

#include "trace/beauty/generated/fsconfig_arrays.c"

static DEFINE_STRARRAY(fsconfig_cmds, "FSCONFIG_");

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, "EPOLL_CTL_", 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers, "ITIMER_");

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options, "KEYCTL_");

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences, "SEEK_");

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds, "F_");

static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] = "CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, "F_", F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources, "RLIMIT_");

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow, "SIG_");

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid, "CLOCK_");
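/*
 * DEFINE_STRARRAY() comes from trace/beauty/beauty.h; a sketch of what it
 * presumably expands to for the table above:
 *
 *	struct strarray strarray__clockid = {
 *		.nr_entries = ARRAY_SIZE(clockid),
 *		.entries    = clockid,
 *		.prefix     = "CLOCK_",
 *	};
 *
 * i.e. it just wraps the string table together with its size and prefix.
 */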
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *suffix = "_OK";
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F%s", show_prefix ? suffix : "");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", #n, show_prefix ? suffix : ""); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename

static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "O_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags

#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						     struct syscall_arg *arg)
{
	bool show_prefix = arg->show_string_prefix;
	const char *prefix = "GRND_";
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s%s", printed ? "|" : "", show_prefix ? prefix : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
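/*
 * Example (sketch): getrandom(..., GRND_RANDOM|GRND_NONBLOCK) prints its
 * flags as "RANDOM|NONBLOCK" ("GRND_RANDOM|GRND_NONBLOCK" with prefixes),
 * with any leftover unknown bits appended in hex by the P_FLAG() tail above.
 */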
"|" : "", flags); 648 649 return printed; 650 } 651 652 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags 653 654 #define STRARRAY(name, array) \ 655 { .scnprintf = SCA_STRARRAY, \ 656 .parm = &strarray__##array, } 657 658 #define STRARRAY_FLAGS(name, array) \ 659 { .scnprintf = SCA_STRARRAY_FLAGS, \ 660 .parm = &strarray__##array, } 661 662 #include "trace/beauty/arch_errno_names.c" 663 #include "trace/beauty/eventfd.c" 664 #include "trace/beauty/futex_op.c" 665 #include "trace/beauty/futex_val3.c" 666 #include "trace/beauty/mmap.c" 667 #include "trace/beauty/mode_t.c" 668 #include "trace/beauty/msg_flags.c" 669 #include "trace/beauty/open_flags.c" 670 #include "trace/beauty/perf_event_open.c" 671 #include "trace/beauty/pid.c" 672 #include "trace/beauty/sched_policy.c" 673 #include "trace/beauty/seccomp.c" 674 #include "trace/beauty/signum.c" 675 #include "trace/beauty/socket_type.c" 676 #include "trace/beauty/waitid_options.c" 677 678 struct syscall_arg_fmt { 679 size_t (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg); 680 unsigned long (*mask_val)(struct syscall_arg *arg, unsigned long val); 681 void *parm; 682 const char *name; 683 bool show_zero; 684 }; 685 686 static struct syscall_fmt { 687 const char *name; 688 const char *alias; 689 struct syscall_arg_fmt arg[6]; 690 u8 nr_args; 691 bool errpid; 692 bool timeout; 693 bool hexret; 694 } syscall_fmts[] = { 695 { .name = "access", 696 .arg = { [1] = { .scnprintf = SCA_ACCMODE, /* mode */ }, }, }, 697 { .name = "arch_prctl", 698 .arg = { [0] = { .scnprintf = SCA_X86_ARCH_PRCTL_CODE, /* code */ }, 699 [1] = { .scnprintf = SCA_PTR, /* arg2 */ }, }, }, 700 { .name = "bind", 701 .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* umyaddr */ }, }, }, 702 { .name = "bpf", 703 .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, }, 704 { .name = "brk", .hexret = true, 705 .arg = { [0] = { .scnprintf = SCA_PTR, /* brk */ }, }, }, 706 { .name = "clock_gettime", 707 .arg = { [0] = STRARRAY(clk_id, clockid), }, }, 708 { .name = "clone", .errpid = true, .nr_args = 5, 709 .arg = { [0] = { .name = "flags", .scnprintf = SCA_CLONE_FLAGS, }, 710 [1] = { .name = "child_stack", .scnprintf = SCA_HEX, }, 711 [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, }, 712 [3] = { .name = "child_tidptr", .scnprintf = SCA_HEX, }, 713 [4] = { .name = "tls", .scnprintf = SCA_HEX, }, }, }, 714 { .name = "close", 715 .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, }, 716 { .name = "connect", 717 .arg = { [1] = { .scnprintf = SCA_SOCKADDR, /* servaddr */ }, }, }, 718 { .name = "epoll_ctl", 719 .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, }, 720 { .name = "eventfd2", 721 .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, }, 722 { .name = "fchmodat", 723 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 724 { .name = "fchownat", 725 .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, }, 726 { .name = "fcntl", 727 .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */ 728 .parm = &strarrays__fcntl_cmds_arrays, 729 .show_zero = true, }, 730 [2] = { .scnprintf = SCA_FCNTL_ARG, /* arg */ }, }, }, 731 { .name = "flock", 732 .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, }, 733 { .name = "fsconfig", 734 .arg = { [1] = STRARRAY(cmd, fsconfig_cmds), }, }, 735 { .name = "fsmount", 736 .arg = { [1] = STRARRAY_FLAGS(flags, fsmount_flags), 737 [2] = { .scnprintf = SCA_FSMOUNT_ATTR_FLAGS, /* attr_flags */ }, }, }, 738 { .name = "fspick", 739 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 740 [1] = { .scnprintf = 
	{ .name = "kcmp", .nr_args = 5,
	  .arg = { [0] = { .name = "pid1", .scnprintf = SCA_PID, },
		   [1] = { .name = "pid2", .scnprintf = SCA_PID, },
		   [2] = { .name = "type", .scnprintf = SCA_KCMP_TYPE, },
		   [3] = { .name = "idx1", .scnprintf = SCA_KCMP_IDX, },
		   [4] = { .name = "idx2", .scnprintf = SCA_KCMP_IDX, }, }, },
	{ .name = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name = "lstat", .alias = "newlstat", },
	{ .name = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name = "mmap", .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ },
		   [5] = { .scnprintf = SCA_HEX,	/* offset */ }, }, },
	{ .name = "mount",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* dev_name */ },
		   [3] = { .scnprintf = SCA_MOUNT_FLAGS, /* flags */
			   .mask_val  = SCAMV_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name = "move_mount",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* from_dfd */ },
		   [1] = { .scnprintf = SCA_FILENAME,	/* from_pathname */ },
		   [2] = { .scnprintf = SCA_FDAT,	/* to_dfd */ },
		   [3] = { .scnprintf = SCA_FILENAME,	/* to_pathname */ },
		   [4] = { .scnprintf = SCA_MOVE_MOUNT_FLAGS, /* flags */ }, }, },
	{ .name = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name = "mremap", .hexret = true,
	  .arg = { [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ }, }, },
	{ .name = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, },
	{ .name = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, },
	{ .name = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name = "poll", .timeout = true, },
	{ .name = "ppoll", .timeout = true, },
	{ .name = "prctl",
	  .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ },
		   [1] = { .scnprintf = SCA_PRCTL_ARG2,	  /* arg2 */ },
		   [2] = { .scnprintf = SCA_PRCTL_ARG3,	  /* arg3 */ }, }, },
	{ .name = "pread", .alias = "pread64", },
	{ .name = "preadv", .alias = "pread", },
	{ .name = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name = "pwrite", .alias = "pwrite64", },
	{ .name = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, },
	{ .name = "renameat2",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ },
		   [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ },
		   [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, },
	{ .name = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name = "select", .timeout = true, },
	{ .name = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, },
	{ .name = "set_tid_address", .errpid = true, },
	{ .name = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
"newfstatat", 820 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 821 { .name = "open", 822 .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 823 { .name = "open_by_handle_at", 824 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 825 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 826 { .name = "openat", 827 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, 828 [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, }, 829 { .name = "perf_event_open", 830 .arg = { [2] = { .scnprintf = SCA_INT, /* cpu */ }, 831 [3] = { .scnprintf = SCA_FD, /* group_fd */ }, 832 [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, }, 833 { .name = "pipe2", 834 .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, }, 835 { .name = "pkey_alloc", 836 .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS, /* access_rights */ }, }, }, 837 { .name = "pkey_free", 838 .arg = { [0] = { .scnprintf = SCA_INT, /* key */ }, }, }, 839 { .name = "pkey_mprotect", 840 .arg = { [0] = { .scnprintf = SCA_HEX, /* start */ }, 841 [2] = { .scnprintf = SCA_MMAP_PROT, /* prot */ }, 842 [3] = { .scnprintf = SCA_INT, /* pkey */ }, }, }, 843 { .name = "poll", .timeout = true, }, 844 { .name = "ppoll", .timeout = true, }, 845 { .name = "prctl", 846 .arg = { [0] = { .scnprintf = SCA_PRCTL_OPTION, /* option */ }, 847 [1] = { .scnprintf = SCA_PRCTL_ARG2, /* arg2 */ }, 848 [2] = { .scnprintf = SCA_PRCTL_ARG3, /* arg3 */ }, }, }, 849 { .name = "pread", .alias = "pread64", }, 850 { .name = "preadv", .alias = "pread", }, 851 { .name = "prlimit64", 852 .arg = { [1] = STRARRAY(resource, rlimit_resources), }, }, 853 { .name = "pwrite", .alias = "pwrite64", }, 854 { .name = "readlinkat", 855 .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, }, 856 { .name = "recvfrom", 857 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 858 { .name = "recvmmsg", 859 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 860 { .name = "recvmsg", 861 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 862 { .name = "renameat", 863 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ }, 864 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, }, }, 865 { .name = "renameat2", 866 .arg = { [0] = { .scnprintf = SCA_FDAT, /* olddirfd */ }, 867 [2] = { .scnprintf = SCA_FDAT, /* newdirfd */ }, 868 [4] = { .scnprintf = SCA_RENAMEAT2_FLAGS, /* flags */ }, }, }, 869 { .name = "rt_sigaction", 870 .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 871 { .name = "rt_sigprocmask", 872 .arg = { [0] = STRARRAY(how, sighow), }, }, 873 { .name = "rt_sigqueueinfo", 874 .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 875 { .name = "rt_tgsigqueueinfo", 876 .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, }, 877 { .name = "sched_setscheduler", 878 .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, }, 879 { .name = "seccomp", 880 .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP, /* op */ }, 881 [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, }, 882 { .name = "select", .timeout = true, }, 883 { .name = "sendmmsg", 884 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 885 { .name = "sendmsg", 886 .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, }, 887 { .name = "sendto", 888 .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, 889 [4] = { .scnprintf = SCA_SOCKADDR, /* addr */ }, }, }, 890 { .name = "set_tid_address", .errpid = true, }, 891 { .name = "setitimer", 892 .arg = { [0] = 
/*
 * is_exit: is this "exit" or "exit_group"?
 * is_open: is this "open" or "openat"? To associate the fd returned in sys_exit with the pathname in sys_enter.
 * args_size: sum of the sizes of the syscall arguments, anything after that is augmented stuff: pathname for openat, etc.
 */
struct syscall {
	struct tep_event    *tp_format;
	int		    nr_args;
	int		    args_size;
	bool		    is_exit;
	bool		    is_open;
	struct tep_format_field *args;
	const char	    *name;
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
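/*
 * Sketch of the sys_enter raw payload layout implied by args_size:
 *
 *	| tracefs args (args_size bytes) | augmented payload (sizes + strings) |
 *
 * e.g. for openat the augmented part carries the dereferenced filename.
 */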
/*
 * Must match what is in the BPF program:
 *
 * tools/perf/examples/bpf/augmented_raw_syscalls.c
 */
struct bpf_map_syscall_entry {
	bool	enabled;
	u16	string_args_len[6];
};

/*
 * We need to have this 'calculated' boolean because in some cases we really
 * don't know what is the duration of a syscall, for instance, when we start
 * a session and some threads are waiting for a syscall to finish, say 'poll',
 * in which case all we can do is to print "( ? )" for duration and for the
 * start timestamp.
 */
static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
{
	double duration = (double)t / NSEC_PER_MSEC;
	size_t printed = fprintf(fp, "(");

	if (!calculated)
		printed += fprintf(fp, "         ");
	else if (duration >= 1.0)
		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
	else if (duration >= 0.01)
		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
	else
		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
	return printed + fprintf(fp, "): ");
}
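/*
 * So a 2.5 ms syscall prints its duration in red, a 0.05 ms one in yellow,
 * anything under 0.01 ms in the normal color, and durations we couldn't
 * compute print as blanks inside the parentheses.
 */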
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;
	char		  *entry_str;
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	      max;
		struct file   *table;
	} files;

	struct intlist *syscall_stats;
};

static struct thread_trace *thread_trace__new(void)
{
	struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));

	if (ttrace) {
		ttrace->files.max = -1;
		ttrace->syscall_stats = intlist__new(NULL);
	}

	return ttrace;
}

static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}


void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}

#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

static const size_t trace__entry_str_size = 2048;

static struct file *thread_trace__files_entry(struct thread_trace *ttrace, int fd)
{
	if (fd < 0)
		return NULL;

	if (fd > ttrace->files.max) {
		struct file *nfiles = realloc(ttrace->files.table, (fd + 1) * sizeof(struct file));

		if (nfiles == NULL)
			return NULL;

		if (ttrace->files.max != -1) {
			memset(nfiles + ttrace->files.max + 1, 0,
			       (fd - ttrace->files.max) * sizeof(struct file));
		} else {
			memset(nfiles, 0, (fd + 1) * sizeof(struct file));
		}

		ttrace->files.table = nfiles;
		ttrace->files.max   = fd;
	}

	return ttrace->files.table + fd;
}

struct file *thread__files_entry(struct thread *thread, int fd)
{
	return thread_trace__files_entry(thread__priv(thread), fd);
}

static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);
	struct file *file = thread_trace__files_entry(ttrace, fd);

	if (file != NULL) {
		struct stat st;
		if (stat(pathname, &st) == 0)
			file->dev_maj = major(st.st_rdev);
		file->pathname = strdup(pathname);
		if (file->pathname)
			return 0;
	}

	return -1;
}
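/*
 * Example: after open("/etc/passwd") returns 3, later syscalls on that fd
 * can be shown as "3</etc/passwd>"; files.table above is the per-thread
 * cache that makes that possible.
 */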
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}

static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->files.max || ttrace->files.table[fd].pathname == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->files.table[fd].pathname;
}

size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

size_t pid__scnprintf_fd(struct trace *trace, pid_t pid, int fd, char *bf, size_t size)
{
	size_t printed = scnprintf(bf, size, "%d", fd);
	struct thread *thread = machine__find_thread(trace->host, pid, pid);

	if (thread) {
		const char *path = thread__fd_path(thread, fd, trace);

		if (path)
			printed += scnprintf(bf + printed, size - printed, "<%s>", path);

		thread__put(thread);
	}

	return printed;
}

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->files.max)
		zfree(&ttrace->files.table[fd].pathname);

	return printed;
}

static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}

static size_t syscall_arg__scnprintf_augmented_string(struct syscall_arg *arg, char *bf, size_t size)
{
	struct augmented_arg *augmented_arg = arg->augmented.args;
	size_t printed = scnprintf(bf, size, "\"%.*s\"", augmented_arg->size, augmented_arg->value);
	/*
	 * So that the next arg with a payload can consume its augmented arg, i.e. for rename* syscalls
	 * we would have two strings, each prefixed by its size.
	 */
	int consumed = sizeof(*augmented_arg) + augmented_arg->size;

	arg->augmented.args = ((void *)arg->augmented.args) + consumed;
	arg->augmented.size -= consumed;

	return printed;
}
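/*
 * The augmented payload is assumed to be a sequence of size-prefixed args
 * (sketch, mirroring tools/perf/examples/bpf/augmented_raw_syscalls.c):
 *
 *	struct augmented_arg { u32 size; ...; char value[]; };
 *
 * so advancing by sizeof(*augmented_arg) + size lands on the next string,
 * which is how the two pathnames of the rename* syscalls are consumed.
 */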
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (arg->augmented.args)
		return syscall_arg__scnprintf_augmented_string(arg, bf, size);

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#x", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}

static bool trace__filter_duration(struct trace *trace, double t)
{
	return t < (trace->duration_filter * NSEC_PER_MSEC);
}

static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ts);
}

/*
 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
 * using ttrace->entry_time for a thread that receives a sys_exit without
 * first having received a sys_enter ("poll" issued before tracing session
 * starts, lost sys_enter exit due to ring buffer overflow).
 */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	if (tstamp > 0)
		return __trace__fprintf_tstamp(trace, tstamp, fp);

	return fprintf(fp, "         ? ");
}

static bool done = false;
static bool interrupted = false;

static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}

static size_t trace__fprintf_comm_tid(struct trace *trace, struct thread *thread, FILE *fp)
{
	size_t printed = 0;

	if (trace->multiple_threads) {
		if (trace->show_comm)
			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
		printed += fprintf(fp, "%d ", thread->tid);
	}

	return printed;
}

static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed = 0;

	if (trace->show_tstamp)
		printed = trace__fprintf_tstamp(trace, tstamp, fp);
	if (trace->show_duration)
		printed += fprintf_duration(duration, duration_calculated, fp);
	return printed + trace__fprintf_comm_tid(trace, thread, fp);
}

static int trace__process_event(struct trace *trace, struct machine *machine,
				union perf_event *event, struct perf_sample *sample)
{
	int ret = 0;

	switch (event->header.type) {
	case PERF_RECORD_LOST:
		color_fprintf(trace->output, PERF_COLOR_RED,
			      "LOST %" PRIu64 " events!\n", event->lost.lost);
		ret = machine__process_lost_event(machine, event, sample);
		break;
	default:
		ret = machine__process_event(machine, event, sample);
		break;
	}

	return ret;
}

static int trace__tool_process(struct perf_tool *tool,
			       union perf_event *event,
			       struct perf_sample *sample,
			       struct machine *machine)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	return trace__process_event(trace, machine, event, sample);
}

static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	err = trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr);
	if (err < 0)
		goto out;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    1);
out:
	if (err)
		symbol__exit();

	return err;
}

static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}

static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
{
	int idx;

	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
		nr_args = sc->fmt->nr_args;

	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
	if (sc->arg_fmt == NULL)
		return -1;

	for (idx = 0; idx < nr_args; ++idx) {
		if (sc->fmt)
			sc->arg_fmt[idx] = sc->fmt->arg[idx];
	}

	sc->nr_args = nr_args;
	return 0;
}

static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct tep_format_field *field, *last_field = NULL;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		last_field = field;

		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		len = strlen(field->name);

		if (strcmp(field->type, "const char *") == 0 &&
		    ((len >= 4 && strcmp(field->name + len - 4, "name") == 0) ||
		     strstr(field->name, "path") != NULL))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if ((field->flags & TEP_FIELD_IS_POINTER) || strstr(field->name, "addr"))
			sc->arg_fmt[idx].scnprintf = SCA_PTR;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 len >= 2 && strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	if (last_field)
		sc->args_size = last_field->offset + last_field->size;

	return 0;
}
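/*
 * Examples of the heuristics above (sketch): a "const char *filename" arg
 * gets SCA_FILENAME, a "pid_t pid" gets SCA_PID and an "unsigned int fd"
 * gets SCA_FD, all without needing an explicit syscall_fmts[] entry.
 */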
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first field, '__syscall_nr' (or
	 * 'nr' on older kernels), which is the syscall number and is needless
	 * here, as the remaining fields are the actual syscall arguments.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");
	sc->is_open = !strcmp(name, "open") || !strcmp(name, "openat");

	return syscall__set_arg_fmts(sc);
}
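/*
 * The qualifier list accepts exact names and globs, e.g. "-e open*,close":
 * "close" resolves via syscalltbl__id() while "open*" expands through
 * syscalltbl__strglobmatch_first()/_next() to openat, open_by_handle_at, etc.
 */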
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0;
	bool printed_invalid_prefix = false;
	struct str_node *pos;
	size_t nr_used = 0, nr_allocated = strlist__nr_entries(trace->ev_qualifier);

	trace->ev_qualifier_ids.entries = malloc(nr_allocated *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			if (!printed_invalid_prefix) {
				pr_debug("Skipping unknown syscalls: ");
				printed_invalid_prefix = true;
			} else {
				pr_debug(", ");
			}

			pr_debug("%s", sc);
			continue;
		}
matches:
		trace->ev_qualifier_ids.entries[nr_used++] = id;
		if (match_next == -1)
			continue;

		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == nr_used) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.entries[nr_used++] = id;
		}
	}

	trace->ev_qualifier_ids.nr = nr_used;
out:
	if (printed_invalid_prefix)
		pr_debug("\n");
	return err;
out_free:
	zfree(&trace->ev_qualifier_ids.entries);
	trace->ev_qualifier_ids.nr = 0;
	goto out;
}

/*
 * args is to be interpreted as a series of longs but we need to handle
 * 8-byte unaligned accesses. args points to raw_data within the event
 * and raw_data is guaranteed to be 8-byte unaligned because it is
 * preceded by raw_size which is a u32. So we need to copy args to a temp
 * variable to read it. Most notably this avoids extended load instructions
 * on unaligned addresses.
 */
unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
{
	unsigned long val;
	unsigned char *p = arg->args + sizeof(unsigned long) * idx;

	memcpy(&val, p, sizeof(val));
	return val;
}

static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
		return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}

/*
 * Check if the value is in fact zero, i.e. mask whatever needs masking, such
 * as mount 'flags' argument that needs ignoring some magic flag, see comment
 * in tools/perf/trace/beauty/mount_flags.c
 */
static unsigned long syscall__mask_val(struct syscall *sc, struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].mask_val)
		return sc->arg_fmt[arg->idx].mask_val(arg, val);

	return val;
}

static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
		arg->val = val;
		if (sc->arg_fmt[arg->idx].parm)
			arg->parm = sc->arg_fmt[arg->idx].parm;
		return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
	}
	return scnprintf(bf, size, "%ld", val);
}
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, void *augmented_args, int augmented_args_size,
				      struct trace *trace, struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.augmented = {
			.size = augmented_args_size,
			.args = augmented_args,
		},
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
		.show_string_prefix = trace->show_string_prefix,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct tep_format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);
			/*
			 * Some syscall args need some mask, most don't and
			 * return val untouched.
			 */
			val = syscall__mask_val(sc, &arg, val);

			/*
			 * Suppress this argument if its value is zero and we
			 * don't have a string associated in an strarray for
			 * it.
			 */
			if (val == 0 &&
			    !trace->show_zeros &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed, "%s", printed ? ", " : "");

			if (trace->show_arg_names)
				printed += scnprintf(bf + printed, size - printed, "%s: ", field->name);

			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}

typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}

static void thread__update_stats(struct thread_trace *ttrace,
				 int id, struct perf_sample *sample)
{
	struct int_node *inode;
	struct stats *stats;
	u64 duration = 0;

	inode = intlist__findnew(ttrace->syscall_stats, id);
	if (inode == NULL)
		return;

	stats = inode->priv;
	if (stats == NULL) {
		stats = malloc(sizeof(struct stats));
		if (stats == NULL)
			return;
		init_stats(stats);
		inode->priv = stats;
	}

	if (ttrace->entry_time && sample->time > ttrace->entry_time)
		duration = sample->time - ttrace->entry_time;

	update_stats(stats, duration);
}

static int trace__printf_interrupted_entry(struct trace *trace)
{
	struct thread_trace *ttrace;
	size_t printed;
	int len;

	if (trace->failure_only || trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	printed  = trace__fprintf_entry_head(trace, trace->current, 0, false, ttrace->entry_time, trace->output);
	printed += len = fprintf(trace->output, "%s)", ttrace->entry_str);

	if (len < trace->args_alignment - 4)
		printed += fprintf(trace->output, "%-*s", trace->args_alignment - 4 - len, " ");

	printed += fprintf(trace->output, " ...\n");

	ttrace->entry_pending = false;
	++trace->nr_events_printed;

	return printed;
}

static int trace__fprintf_sample(struct trace *trace, struct perf_evsel *evsel,
				 struct perf_sample *sample, struct thread *thread)
{
	int printed = 0;

	if (trace->print_sample) {
		double ts = (double)sample->time / NSEC_PER_MSEC;

		printed += fprintf(trace->output, "%22s %10.3f %s %d/%d [%d]\n",
				   perf_evsel__name(evsel), ts,
				   thread__comm_str(thread),
				   sample->pid, sample->tid, sample->cpu);
	}

	return printed;
}
	 *
	 * We'll revisit this later to pass sc->args_size to the BPF augmenter
	 * (now tools/perf/examples/bpf/augmented_raw_syscalls.c), so that it
	 * copies only what we need for each syscall, like what happens when we
	 * use syscalls:sys_enter_NAME, so that we reduce the kernel/userspace
	 * traffic to just what is needed for each syscall.
	 */
	int args_size = raw_augmented_args_size ?: sc->args_size;

	*augmented_args_size = sample->raw_size - args_size;
	if (*augmented_args_size > 0)
		augmented_args = sample->raw_data + args_size;

	return augmented_args;
}

static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	int printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	int augmented_args_size = 0;
	void *augmented_args = NULL;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	trace__fprintf_sample(trace, evsel, sample, thread);

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace);
	/*
	 * If this is raw_syscalls:sys_enter, then it always comes with all 6
	 * possible arguments, even if the syscall being handled, say "openat",
	 * uses only 4 of them. That breaks the syscall__augmented_args() check
	 * for augmented args, as we calculate syscall->args_size using each
	 * syscalls:sys_enter_NAME tracefs format file. So when handling, say,
	 * the openat syscall, we would get 6 args for the raw_syscalls:sys_enter
	 * event when we expected just 4, and would mistakenly take the extra
	 * 2 u64 args for the augmented filename. Just check here and avoid
	 * using augmented syscalls when the evsel is the raw_syscalls one.
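	 *
	 * E.g. (sketch): openat() uses 4 args, so sc->args_size would be
	 * 4 * 8 = 32 bytes, while a raw_syscalls:sys_enter record always
	 * carries 6 * 8 = 48 bytes of args, and those trailing 16 bytes
	 * would be misread as an augmented filename.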
1926 */ 1927 if (evsel != trace->syscalls.events.sys_enter) 1928 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); 1929 ttrace->entry_time = sample->time; 1930 msg = ttrace->entry_str; 1931 printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name); 1932 1933 printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed, 1934 args, augmented_args, augmented_args_size, trace, thread); 1935 1936 if (sc->is_exit) { 1937 if (!(trace->duration_filter || trace->summary_only || trace->failure_only || trace->min_stack)) { 1938 int alignment = 0; 1939 1940 trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output); 1941 printed = fprintf(trace->output, "%s)", ttrace->entry_str); 1942 if (trace->args_alignment > printed) 1943 alignment = trace->args_alignment - printed; 1944 fprintf(trace->output, "%*s= ?\n", alignment, " "); 1945 } 1946 } else { 1947 ttrace->entry_pending = true; 1948 /* See trace__vfs_getname & trace__sys_exit */ 1949 ttrace->filename.pending_open = false; 1950 } 1951 1952 if (trace->current != thread) { 1953 thread__put(trace->current); 1954 trace->current = thread__get(thread); 1955 } 1956 err = 0; 1957 out_put: 1958 thread__put(thread); 1959 return err; 1960 } 1961 1962 static int trace__fprintf_sys_enter(struct trace *trace, struct perf_evsel *evsel, 1963 struct perf_sample *sample) 1964 { 1965 struct thread_trace *ttrace; 1966 struct thread *thread; 1967 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1; 1968 struct syscall *sc = trace__syscall_info(trace, evsel, id); 1969 char msg[1024]; 1970 void *args, *augmented_args = NULL; 1971 int augmented_args_size; 1972 1973 if (sc == NULL) 1974 return -1; 1975 1976 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 1977 ttrace = thread__trace(thread, trace->output); 1978 /* 1979 * We need to get ttrace just to make sure it is there when syscall__scnprintf_args() 1980 * and the rest of the beautifiers accessing it via struct syscall_arg touches it. 1981 */ 1982 if (ttrace == NULL) 1983 goto out_put; 1984 1985 args = perf_evsel__sc_tp_ptr(evsel, args, sample); 1986 augmented_args = syscall__augmented_args(sc, sample, &augmented_args_size, trace->raw_augmented_syscalls_args_size); 1987 syscall__scnprintf_args(sc, msg, sizeof(msg), args, augmented_args, augmented_args_size, trace, thread); 1988 fprintf(trace->output, "%s", msg); 1989 err = 0; 1990 out_put: 1991 thread__put(thread); 1992 return err; 1993 } 1994 1995 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel, 1996 struct perf_sample *sample, 1997 struct callchain_cursor *cursor) 1998 { 1999 struct addr_location al; 2000 int max_stack = evsel->attr.sample_max_stack ? 
2001 evsel->attr.sample_max_stack : 2002 trace->max_stack; 2003 int err; 2004 2005 if (machine__resolve(trace->host, &al, sample) < 0) 2006 return -1; 2007 2008 err = thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, max_stack); 2009 addr_location__put(&al); 2010 return err; 2011 } 2012 2013 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample) 2014 { 2015 /* TODO: user-configurable print_opts */ 2016 const unsigned int print_opts = EVSEL__PRINT_SYM | 2017 EVSEL__PRINT_DSO | 2018 EVSEL__PRINT_UNKNOWN_AS_ADDR; 2019 2020 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output); 2021 } 2022 2023 static const char *errno_to_name(struct perf_evsel *evsel, int err) 2024 { 2025 struct perf_env *env = perf_evsel__env(evsel); 2026 const char *arch_name = perf_env__arch(env); 2027 2028 return arch_syscalls__strerrno(arch_name, err); 2029 } 2030 2031 static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel, 2032 union perf_event *event __maybe_unused, 2033 struct perf_sample *sample) 2034 { 2035 long ret; 2036 u64 duration = 0; 2037 bool duration_calculated = false; 2038 struct thread *thread; 2039 int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0, printed = 0; 2040 int alignment = trace->args_alignment; 2041 struct syscall *sc = trace__syscall_info(trace, evsel, id); 2042 struct thread_trace *ttrace; 2043 2044 if (sc == NULL) 2045 return -1; 2046 2047 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2048 ttrace = thread__trace(thread, trace->output); 2049 if (ttrace == NULL) 2050 goto out_put; 2051 2052 trace__fprintf_sample(trace, evsel, sample, thread); 2053 2054 if (trace->summary) 2055 thread__update_stats(ttrace, id, sample); 2056 2057 ret = perf_evsel__sc_tp_uint(evsel, ret, sample); 2058 2059 if (sc->is_open && ret >= 0 && ttrace->filename.pending_open) { 2060 trace__set_fd_pathname(thread, ret, ttrace->filename.name); 2061 ttrace->filename.pending_open = false; 2062 ++trace->stats.vfs_getname; 2063 } 2064 2065 if (ttrace->entry_time) { 2066 duration = sample->time - ttrace->entry_time; 2067 if (trace__filter_duration(trace, duration)) 2068 goto out; 2069 duration_calculated = true; 2070 } else if (trace->duration_filter) 2071 goto out; 2072 2073 if (sample->callchain) { 2074 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 2075 if (callchain_ret == 0) { 2076 if (callchain_cursor.nr < trace->min_stack) 2077 goto out; 2078 callchain_ret = 1; 2079 } 2080 } 2081 2082 if (trace->summary_only || (ret >= 0 && trace->failure_only)) 2083 goto out; 2084 2085 trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output); 2086 2087 if (ttrace->entry_pending) { 2088 printed = fprintf(trace->output, "%s", ttrace->entry_str); 2089 } else { 2090 printed += fprintf(trace->output, " ... 
["); 2091 color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued"); 2092 printed += 9; 2093 printed += fprintf(trace->output, "]: %s()", sc->name); 2094 } 2095 2096 printed++; /* the closing ')' */ 2097 2098 if (alignment > printed) 2099 alignment -= printed; 2100 else 2101 alignment = 0; 2102 2103 fprintf(trace->output, ")%*s= ", alignment, " "); 2104 2105 if (sc->fmt == NULL) { 2106 if (ret < 0) 2107 goto errno_print; 2108 signed_print: 2109 fprintf(trace->output, "%ld", ret); 2110 } else if (ret < 0) { 2111 errno_print: { 2112 char bf[STRERR_BUFSIZE]; 2113 const char *emsg = str_error_r(-ret, bf, sizeof(bf)), 2114 *e = errno_to_name(evsel, -ret); 2115 2116 fprintf(trace->output, "-1 %s (%s)", e, emsg); 2117 } 2118 } else if (ret == 0 && sc->fmt->timeout) 2119 fprintf(trace->output, "0 (Timeout)"); 2120 else if (ttrace->ret_scnprintf) { 2121 char bf[1024]; 2122 struct syscall_arg arg = { 2123 .val = ret, 2124 .thread = thread, 2125 .trace = trace, 2126 }; 2127 ttrace->ret_scnprintf(bf, sizeof(bf), &arg); 2128 ttrace->ret_scnprintf = NULL; 2129 fprintf(trace->output, "%s", bf); 2130 } else if (sc->fmt->hexret) 2131 fprintf(trace->output, "%#lx", ret); 2132 else if (sc->fmt->errpid) { 2133 struct thread *child = machine__find_thread(trace->host, ret, ret); 2134 2135 if (child != NULL) { 2136 fprintf(trace->output, "%ld", ret); 2137 if (child->comm_set) 2138 fprintf(trace->output, " (%s)", thread__comm_str(child)); 2139 thread__put(child); 2140 } 2141 } else 2142 goto signed_print; 2143 2144 fputc('\n', trace->output); 2145 2146 /* 2147 * We only consider an 'event' for the sake of --max-events a non-filtered 2148 * sys_enter + sys_exit and other tracepoint events. 2149 */ 2150 if (++trace->nr_events_printed == trace->max_events && trace->max_events != ULONG_MAX) 2151 interrupted = true; 2152 2153 if (callchain_ret > 0) 2154 trace__fprintf_callchain(trace, sample); 2155 else if (callchain_ret < 0) 2156 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 2157 out: 2158 ttrace->entry_pending = false; 2159 err = 0; 2160 out_put: 2161 thread__put(thread); 2162 return err; 2163 } 2164 2165 static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel, 2166 union perf_event *event __maybe_unused, 2167 struct perf_sample *sample) 2168 { 2169 struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2170 struct thread_trace *ttrace; 2171 size_t filename_len, entry_str_len, to_move; 2172 ssize_t remaining_space; 2173 char *pos; 2174 const char *filename = perf_evsel__rawptr(evsel, sample, "pathname"); 2175 2176 if (!thread) 2177 goto out; 2178 2179 ttrace = thread__priv(thread); 2180 if (!ttrace) 2181 goto out_put; 2182 2183 filename_len = strlen(filename); 2184 if (filename_len == 0) 2185 goto out_put; 2186 2187 if (ttrace->filename.namelen < filename_len) { 2188 char *f = realloc(ttrace->filename.name, filename_len + 1); 2189 2190 if (f == NULL) 2191 goto out_put; 2192 2193 ttrace->filename.namelen = filename_len; 2194 ttrace->filename.name = f; 2195 } 2196 2197 strcpy(ttrace->filename.name, filename); 2198 ttrace->filename.pending_open = true; 2199 2200 if (!ttrace->filename.ptr) 2201 goto out_put; 2202 2203 entry_str_len = strlen(ttrace->entry_str); 2204 remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */ 2205 if (remaining_space <= 0) 2206 goto out_put; 2207 2208 if (filename_len > (size_t)remaining_space) { 2209 filename += filename_len - remaining_space; 2210 filename_len = remaining_space; 2211 } 
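
	/*
	 * Splice the (possibly truncated) filename into entry_str at the spot
	 * recorded when the pathname pointer was formatted: shift the tail of
	 * the string (including the terminating \0) right by filename_len with
	 * memmove(), then drop the filename into the hole with memcpy().
	 */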
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}

static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	/*
	 * thread__trace() couldn't set up the per-thread state, so at least
	 * dump the raw tracepoint fields instead of accounting them.
	 */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
		evsel->name,
		perf_evsel__strval(evsel, sample, "comm"),
		(pid_t)perf_evsel__intval(evsel, sample, "pid"),
		runtime,
		perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}

static int bpf_output__printer(enum binary_printer_ops op,
			       unsigned int val, void *extra __maybe_unused, FILE *fp)
{
	unsigned char ch = (unsigned char)val;

	switch (op) {
	case BINARY_PRINT_CHAR_DATA:
		return fprintf(fp, "%c", isprint(ch) ? ch : '.');
	case BINARY_PRINT_DATA_BEGIN:
	case BINARY_PRINT_LINE_BEGIN:
	case BINARY_PRINT_ADDR:
	case BINARY_PRINT_NUM_DATA:
	case BINARY_PRINT_NUM_PAD:
	case BINARY_PRINT_SEP:
	case BINARY_PRINT_CHAR_PAD:
	case BINARY_PRINT_LINE_END:
	case BINARY_PRINT_DATA_END:
	default:
		break;
	}

	return 0;
}

static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	binary__fprintf(sample->raw_data, sample->raw_size, 8,
			bpf_output__printer, NULL, trace->output);
	++trace->nr_events_printed;
}

static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	struct thread *thread;
	int callchain_ret = 0;
	/*
	 * Check if we called perf_evsel__disable(evsel) due to, for instance,
	 * this event's max_events having been hit and this is an entry coming
	 * from the ring buffer that we should discard, since the max events
	 * have already been considered/printed.
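	 *
	 * See the evsel->max_events handling below, where the evsel gets
	 * disabled and closed once its print quota has been hit.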
2299 */ 2300 if (evsel->disabled) 2301 return 0; 2302 2303 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2304 2305 if (sample->callchain) { 2306 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 2307 if (callchain_ret == 0) { 2308 if (callchain_cursor.nr < trace->min_stack) 2309 goto out; 2310 callchain_ret = 1; 2311 } 2312 } 2313 2314 trace__printf_interrupted_entry(trace); 2315 trace__fprintf_tstamp(trace, sample->time, trace->output); 2316 2317 if (trace->trace_syscalls && trace->show_duration) 2318 fprintf(trace->output, "( ): "); 2319 2320 if (thread) 2321 trace__fprintf_comm_tid(trace, thread, trace->output); 2322 2323 if (evsel == trace->syscalls.events.augmented) { 2324 int id = perf_evsel__sc_tp_uint(evsel, id, sample); 2325 struct syscall *sc = trace__syscall_info(trace, evsel, id); 2326 2327 if (sc) { 2328 fprintf(trace->output, "%s(", sc->name); 2329 trace__fprintf_sys_enter(trace, evsel, sample); 2330 fputc(')', trace->output); 2331 goto newline; 2332 } 2333 2334 /* 2335 * XXX: Not having the associated syscall info or not finding/adding 2336 * the thread should never happen, but if it does... 2337 * fall thru and print it as a bpf_output event. 2338 */ 2339 } 2340 2341 fprintf(trace->output, "%s:", evsel->name); 2342 2343 if (perf_evsel__is_bpf_output(evsel)) { 2344 bpf_output__fprintf(trace, sample); 2345 } else if (evsel->tp_format) { 2346 if (strncmp(evsel->tp_format->name, "sys_enter_", 10) || 2347 trace__fprintf_sys_enter(trace, evsel, sample)) { 2348 event_format__fprintf(evsel->tp_format, sample->cpu, 2349 sample->raw_data, sample->raw_size, 2350 trace->output); 2351 ++trace->nr_events_printed; 2352 2353 if (evsel->max_events != ULONG_MAX && ++evsel->nr_events_printed == evsel->max_events) { 2354 perf_evsel__disable(evsel); 2355 perf_evsel__close(evsel); 2356 } 2357 } 2358 } 2359 2360 newline: 2361 fprintf(trace->output, "\n"); 2362 2363 if (callchain_ret > 0) 2364 trace__fprintf_callchain(trace, sample); 2365 else if (callchain_ret < 0) 2366 pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel)); 2367 out: 2368 thread__put(thread); 2369 return 0; 2370 } 2371 2372 static void print_location(FILE *f, struct perf_sample *sample, 2373 struct addr_location *al, 2374 bool print_dso, bool print_sym) 2375 { 2376 2377 if ((verbose > 0 || print_dso) && al->map) 2378 fprintf(f, "%s@", al->map->dso->long_name); 2379 2380 if ((verbose > 0 || print_sym) && al->sym) 2381 fprintf(f, "%s+0x%" PRIx64, al->sym->name, 2382 al->addr - al->sym->start); 2383 else if (al->map) 2384 fprintf(f, "0x%" PRIx64, al->addr); 2385 else 2386 fprintf(f, "0x%" PRIx64, sample->addr); 2387 } 2388 2389 static int trace__pgfault(struct trace *trace, 2390 struct perf_evsel *evsel, 2391 union perf_event *event __maybe_unused, 2392 struct perf_sample *sample) 2393 { 2394 struct thread *thread; 2395 struct addr_location al; 2396 char map_type = 'd'; 2397 struct thread_trace *ttrace; 2398 int err = -1; 2399 int callchain_ret = 0; 2400 2401 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2402 2403 if (sample->callchain) { 2404 callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor); 2405 if (callchain_ret == 0) { 2406 if (callchain_cursor.nr < trace->min_stack) 2407 goto out_put; 2408 callchain_ret = 1; 2409 } 2410 } 2411 2412 ttrace = thread__trace(thread, trace->output); 2413 if (ttrace == NULL) 2414 goto out_put; 2415 2416 if (evsel->attr.config == 
PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_symbol(thread, sample->cpumode, sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_symbol(thread, sample->cpumode, sample->addr, &al);

	/*
	 * This used to do a second lookup in the executable (MAP__FUNCTION)
	 * maps, marking the target 'x' when found there, but since the map
	 * types were unified, repeating the identical lookup can't find
	 * anything new, so when there is no map for the target address just
	 * mark it as unknown.
	 */
	if (!al.map)
		map_type = '?';

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));

	++trace->nr_events_printed;
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}

static void trace__set_base_time(struct trace *trace,
				 struct perf_evsel *evsel,
				 struct perf_sample *sample)
{
	/*
	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
	 * and don't use sample->time unconditionally; we may end up having
	 * some other event in the future without PERF_SAMPLE_TIME for a good
	 * reason, i.e. we may not be interested in its timestamps, just in
	 * it taking place, picking some piece of information when it
	 * appears in our event stream (vfs_getname comes to mind).
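	 *
	 * Only the first event that does carry PERF_SAMPLE_TIME establishes
	 * the base, and with --full-time (-T) base_time is left at zero, so
	 * timestamps get printed in full instead of relative to the first
	 * event.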
2475 */ 2476 if (trace->base_time == 0 && !trace->full_time && 2477 (evsel->attr.sample_type & PERF_SAMPLE_TIME)) 2478 trace->base_time = sample->time; 2479 } 2480 2481 static int trace__process_sample(struct perf_tool *tool, 2482 union perf_event *event, 2483 struct perf_sample *sample, 2484 struct perf_evsel *evsel, 2485 struct machine *machine __maybe_unused) 2486 { 2487 struct trace *trace = container_of(tool, struct trace, tool); 2488 struct thread *thread; 2489 int err = 0; 2490 2491 tracepoint_handler handler = evsel->handler; 2492 2493 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid); 2494 if (thread && thread__is_filtered(thread)) 2495 goto out; 2496 2497 trace__set_base_time(trace, evsel, sample); 2498 2499 if (handler) { 2500 ++trace->nr_events; 2501 handler(trace, evsel, event, sample); 2502 } 2503 out: 2504 thread__put(thread); 2505 return err; 2506 } 2507 2508 static int trace__record(struct trace *trace, int argc, const char **argv) 2509 { 2510 unsigned int rec_argc, i, j; 2511 const char **rec_argv; 2512 const char * const record_args[] = { 2513 "record", 2514 "-R", 2515 "-m", "1024", 2516 "-c", "1", 2517 }; 2518 2519 const char * const sc_args[] = { "-e", }; 2520 unsigned int sc_args_nr = ARRAY_SIZE(sc_args); 2521 const char * const majpf_args[] = { "-e", "major-faults" }; 2522 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args); 2523 const char * const minpf_args[] = { "-e", "minor-faults" }; 2524 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args); 2525 2526 /* +1 is for the event string below */ 2527 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 + 2528 majpf_args_nr + minpf_args_nr + argc; 2529 rec_argv = calloc(rec_argc + 1, sizeof(char *)); 2530 2531 if (rec_argv == NULL) 2532 return -ENOMEM; 2533 2534 j = 0; 2535 for (i = 0; i < ARRAY_SIZE(record_args); i++) 2536 rec_argv[j++] = record_args[i]; 2537 2538 if (trace->trace_syscalls) { 2539 for (i = 0; i < sc_args_nr; i++) 2540 rec_argv[j++] = sc_args[i]; 2541 2542 /* event string may be different for older kernels - e.g., RHEL6 */ 2543 if (is_valid_tracepoint("raw_syscalls:sys_enter")) 2544 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit"; 2545 else if (is_valid_tracepoint("syscalls:sys_enter")) 2546 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit"; 2547 else { 2548 pr_err("Neither raw_syscalls nor syscalls events exist.\n"); 2549 free(rec_argv); 2550 return -1; 2551 } 2552 } 2553 2554 if (trace->trace_pgfaults & TRACE_PFMAJ) 2555 for (i = 0; i < majpf_args_nr; i++) 2556 rec_argv[j++] = majpf_args[i]; 2557 2558 if (trace->trace_pgfaults & TRACE_PFMIN) 2559 for (i = 0; i < minpf_args_nr; i++) 2560 rec_argv[j++] = minpf_args[i]; 2561 2562 for (i = 0; i < (unsigned int)argc; i++) 2563 rec_argv[j++] = argv[i]; 2564 2565 return cmd_record(j, rec_argv); 2566 } 2567 2568 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp); 2569 2570 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist) 2571 { 2572 bool found = false; 2573 struct perf_evsel *evsel, *tmp; 2574 struct parse_events_error err = { .idx = 0, }; 2575 int ret = parse_events(evlist, "probe:vfs_getname*", &err); 2576 2577 if (ret) 2578 return false; 2579 2580 evlist__for_each_entry_safe(evlist, evsel, tmp) { 2581 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname")) 2582 continue; 2583 2584 if (perf_evsel__field(evsel, "pathname")) { 2585 evsel->handler = trace__vfs_getname; 2586 found = true; 2587 continue; 2588 } 2589 2590 list_del_init(&evsel->node); 2591 evsel->evlist = 
NULL;
		perf_evsel__delete(evsel);
	}

	return found;
}

static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel)
		evsel->handler = trace__pgfault;

	return evsel;
}

static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
			perf_evsel__name(evsel), sample->tid,
			sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}

	if (trace->nr_events_printed >= trace->max_events && trace->max_events != ULONG_MAX)
		interrupted = true;
}

static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__raw_syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__raw_syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evsel__config_callchain(sys_enter, &trace->opts, &callchain_param);
	perf_evsel__config_callchain(sys_exit, &trace->opts, &callchain_param);

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel-syscall-graph.
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}

static int trace__set_ev_qualifier_tp_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter))
{ 2712 sys_exit = trace->syscalls.events.sys_exit; 2713 err = perf_evsel__append_tp_filter(sys_exit, filter); 2714 } 2715 2716 free(filter); 2717 out: 2718 return err; 2719 out_enomem: 2720 errno = ENOMEM; 2721 goto out; 2722 } 2723 2724 #ifdef HAVE_LIBBPF_SUPPORT 2725 static void trace__init_bpf_map_syscall_args(struct trace *trace, int id, struct bpf_map_syscall_entry *entry) 2726 { 2727 struct syscall *sc = trace__syscall_info(trace, NULL, id); 2728 int arg = 0; 2729 2730 if (sc == NULL) 2731 goto out; 2732 2733 for (; arg < sc->nr_args; ++arg) { 2734 entry->string_args_len[arg] = 0; 2735 if (sc->arg_fmt[arg].scnprintf == SCA_FILENAME) { 2736 /* Should be set like strace -s strsize */ 2737 entry->string_args_len[arg] = PATH_MAX; 2738 } 2739 } 2740 out: 2741 for (; arg < 6; ++arg) 2742 entry->string_args_len[arg] = 0; 2743 } 2744 static int trace__set_ev_qualifier_bpf_filter(struct trace *trace) 2745 { 2746 int fd = bpf_map__fd(trace->syscalls.map); 2747 struct bpf_map_syscall_entry value = { 2748 .enabled = !trace->not_ev_qualifier, 2749 }; 2750 int err = 0; 2751 size_t i; 2752 2753 for (i = 0; i < trace->ev_qualifier_ids.nr; ++i) { 2754 int key = trace->ev_qualifier_ids.entries[i]; 2755 2756 if (value.enabled) 2757 trace__init_bpf_map_syscall_args(trace, key, &value); 2758 2759 err = bpf_map_update_elem(fd, &key, &value, BPF_EXIST); 2760 if (err) 2761 break; 2762 } 2763 2764 return err; 2765 } 2766 2767 static int __trace__init_syscalls_bpf_map(struct trace *trace, bool enabled) 2768 { 2769 int fd = bpf_map__fd(trace->syscalls.map); 2770 struct bpf_map_syscall_entry value = { 2771 .enabled = enabled, 2772 }; 2773 int err = 0, key; 2774 2775 for (key = 0; key < trace->sctbl->syscalls.nr_entries; ++key) { 2776 if (enabled) 2777 trace__init_bpf_map_syscall_args(trace, key, &value); 2778 2779 err = bpf_map_update_elem(fd, &key, &value, BPF_ANY); 2780 if (err) 2781 break; 2782 } 2783 2784 return err; 2785 } 2786 2787 static int trace__init_syscalls_bpf_map(struct trace *trace) 2788 { 2789 bool enabled = true; 2790 2791 if (trace->ev_qualifier_ids.nr) 2792 enabled = trace->not_ev_qualifier; 2793 2794 return __trace__init_syscalls_bpf_map(trace, enabled); 2795 } 2796 #else 2797 static int trace__set_ev_qualifier_bpf_filter(struct trace *trace __maybe_unused) 2798 { 2799 return 0; 2800 } 2801 2802 static int trace__init_syscalls_bpf_map(struct trace *trace __maybe_unused) 2803 { 2804 return 0; 2805 } 2806 #endif // HAVE_LIBBPF_SUPPORT 2807 2808 static int trace__set_ev_qualifier_filter(struct trace *trace) 2809 { 2810 if (trace->syscalls.map) 2811 return trace__set_ev_qualifier_bpf_filter(trace); 2812 if (trace->syscalls.events.sys_enter) 2813 return trace__set_ev_qualifier_tp_filter(trace); 2814 return 0; 2815 } 2816 2817 static int bpf_map__set_filter_pids(struct bpf_map *map __maybe_unused, 2818 size_t npids __maybe_unused, pid_t *pids __maybe_unused) 2819 { 2820 int err = 0; 2821 #ifdef HAVE_LIBBPF_SUPPORT 2822 bool value = true; 2823 int map_fd = bpf_map__fd(map); 2824 size_t i; 2825 2826 for (i = 0; i < npids; ++i) { 2827 err = bpf_map_update_elem(map_fd, &pids[i], &value, BPF_ANY); 2828 if (err) 2829 break; 2830 } 2831 #endif 2832 return err; 2833 } 2834 2835 static int trace__set_filter_loop_pids(struct trace *trace) 2836 { 2837 unsigned int nr = 1, err; 2838 pid_t pids[32] = { 2839 getpid(), 2840 }; 2841 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]); 2842 2843 while (thread && nr < ARRAY_SIZE(pids)) { 2844 struct thread *parent = 
machine__find_thread(trace->host, thread->ppid, thread->ppid); 2845 2846 if (parent == NULL) 2847 break; 2848 2849 if (!strcmp(thread__comm_str(parent), "sshd") || 2850 strstarts(thread__comm_str(parent), "gnome-terminal")) { 2851 pids[nr++] = parent->tid; 2852 break; 2853 } 2854 thread = parent; 2855 } 2856 2857 err = perf_evlist__set_tp_filter_pids(trace->evlist, nr, pids); 2858 if (!err && trace->filter_pids.map) 2859 err = bpf_map__set_filter_pids(trace->filter_pids.map, nr, pids); 2860 2861 return err; 2862 } 2863 2864 static int trace__set_filter_pids(struct trace *trace) 2865 { 2866 int err = 0; 2867 /* 2868 * Better not use !target__has_task() here because we need to cover the 2869 * case where no threads were specified in the command line, but a 2870 * workload was, and in that case we will fill in the thread_map when 2871 * we fork the workload in perf_evlist__prepare_workload. 2872 */ 2873 if (trace->filter_pids.nr > 0) { 2874 err = perf_evlist__set_tp_filter_pids(trace->evlist, trace->filter_pids.nr, 2875 trace->filter_pids.entries); 2876 if (!err && trace->filter_pids.map) { 2877 err = bpf_map__set_filter_pids(trace->filter_pids.map, trace->filter_pids.nr, 2878 trace->filter_pids.entries); 2879 } 2880 } else if (thread_map__pid(trace->evlist->threads, 0) == -1) { 2881 err = trace__set_filter_loop_pids(trace); 2882 } 2883 2884 return err; 2885 } 2886 2887 static int __trace__deliver_event(struct trace *trace, union perf_event *event) 2888 { 2889 struct perf_evlist *evlist = trace->evlist; 2890 struct perf_sample sample; 2891 int err; 2892 2893 err = perf_evlist__parse_sample(evlist, event, &sample); 2894 if (err) 2895 fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err); 2896 else 2897 trace__handle_event(trace, event, &sample); 2898 2899 return 0; 2900 } 2901 2902 static int __trace__flush_events(struct trace *trace) 2903 { 2904 u64 first = ordered_events__first_time(&trace->oe.data); 2905 u64 flush = trace->oe.last - NSEC_PER_SEC; 2906 2907 /* Is there some thing to flush.. */ 2908 if (first && first < flush) 2909 return ordered_events__flush_time(&trace->oe.data, flush); 2910 2911 return 0; 2912 } 2913 2914 static int trace__flush_events(struct trace *trace) 2915 { 2916 return !trace->sort_events ? 
0 : __trace__flush_events(trace); 2917 } 2918 2919 static int trace__deliver_event(struct trace *trace, union perf_event *event) 2920 { 2921 int err; 2922 2923 if (!trace->sort_events) 2924 return __trace__deliver_event(trace, event); 2925 2926 err = perf_evlist__parse_sample_timestamp(trace->evlist, event, &trace->oe.last); 2927 if (err && err != -1) 2928 return err; 2929 2930 err = ordered_events__queue(&trace->oe.data, event, trace->oe.last, 0); 2931 if (err) 2932 return err; 2933 2934 return trace__flush_events(trace); 2935 } 2936 2937 static int ordered_events__deliver_event(struct ordered_events *oe, 2938 struct ordered_event *event) 2939 { 2940 struct trace *trace = container_of(oe, struct trace, oe.data); 2941 2942 return __trace__deliver_event(trace, event->event); 2943 } 2944 2945 static int trace__run(struct trace *trace, int argc, const char **argv) 2946 { 2947 struct perf_evlist *evlist = trace->evlist; 2948 struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL; 2949 int err = -1, i; 2950 unsigned long before; 2951 const bool forks = argc > 0; 2952 bool draining = false; 2953 2954 trace->live = true; 2955 2956 if (!trace->raw_augmented_syscalls) { 2957 if (trace->trace_syscalls && trace__add_syscall_newtp(trace)) 2958 goto out_error_raw_syscalls; 2959 2960 if (trace->trace_syscalls) 2961 trace->vfs_getname = perf_evlist__add_vfs_getname(evlist); 2962 } 2963 2964 if ((trace->trace_pgfaults & TRACE_PFMAJ)) { 2965 pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ); 2966 if (pgfault_maj == NULL) 2967 goto out_error_mem; 2968 perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param); 2969 perf_evlist__add(evlist, pgfault_maj); 2970 } 2971 2972 if ((trace->trace_pgfaults & TRACE_PFMIN)) { 2973 pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN); 2974 if (pgfault_min == NULL) 2975 goto out_error_mem; 2976 perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param); 2977 perf_evlist__add(evlist, pgfault_min); 2978 } 2979 2980 if (trace->sched && 2981 perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime", 2982 trace__sched_stat_runtime)) 2983 goto out_error_sched_stat_runtime; 2984 2985 /* 2986 * If a global cgroup was set, apply it to all the events without an 2987 * explicit cgroup. I.e.: 2988 * 2989 * trace -G A -e sched:*switch 2990 * 2991 * Will set all raw_syscalls:sys_{enter,exit}, pgfault, vfs_getname, etc 2992 * _and_ sched:sched_switch to the 'A' cgroup, while: 2993 * 2994 * trace -e sched:*switch -G A 2995 * 2996 * will only set the sched:sched_switch event to the 'A' cgroup, all the 2997 * other events (raw_syscalls:sys_{enter,exit}, etc are left "without" 2998 * a cgroup (on the root cgroup, sys wide, etc). 2999 * 3000 * Multiple cgroups: 3001 * 3002 * trace -G A -e sched:*switch -G B 3003 * 3004 * the syscall ones go to the 'A' cgroup, the sched:sched_switch goes 3005 * to the 'B' cgroup. 3006 * 3007 * evlist__set_default_cgroup() grabs a reference of the passed cgroup 3008 * only for the evsels still without a cgroup, i.e. evsel->cgroup == NULL. 
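	 *
	 * I.e. the position of -G relative to the -e options matters: given
	 * before any -e it becomes the default cgroup for all events, given
	 * after some -e it applies only to the events defined up to that
	 * point.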
3009 */ 3010 if (trace->cgroup) 3011 evlist__set_default_cgroup(trace->evlist, trace->cgroup); 3012 3013 err = perf_evlist__create_maps(evlist, &trace->opts.target); 3014 if (err < 0) { 3015 fprintf(trace->output, "Problems parsing the target to trace, check your options!\n"); 3016 goto out_delete_evlist; 3017 } 3018 3019 err = trace__symbols_init(trace, evlist); 3020 if (err < 0) { 3021 fprintf(trace->output, "Problems initializing symbol libraries!\n"); 3022 goto out_delete_evlist; 3023 } 3024 3025 perf_evlist__config(evlist, &trace->opts, &callchain_param); 3026 3027 signal(SIGCHLD, sig_handler); 3028 signal(SIGINT, sig_handler); 3029 3030 if (forks) { 3031 err = perf_evlist__prepare_workload(evlist, &trace->opts.target, 3032 argv, false, NULL); 3033 if (err < 0) { 3034 fprintf(trace->output, "Couldn't run the workload!\n"); 3035 goto out_delete_evlist; 3036 } 3037 } 3038 3039 err = perf_evlist__open(evlist); 3040 if (err < 0) 3041 goto out_error_open; 3042 3043 err = bpf__apply_obj_config(); 3044 if (err) { 3045 char errbuf[BUFSIZ]; 3046 3047 bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf)); 3048 pr_err("ERROR: Apply config to BPF failed: %s\n", 3049 errbuf); 3050 goto out_error_open; 3051 } 3052 3053 err = trace__set_filter_pids(trace); 3054 if (err < 0) 3055 goto out_error_mem; 3056 3057 if (trace->syscalls.map) 3058 trace__init_syscalls_bpf_map(trace); 3059 3060 if (trace->ev_qualifier_ids.nr > 0) { 3061 err = trace__set_ev_qualifier_filter(trace); 3062 if (err < 0) 3063 goto out_errno; 3064 3065 if (trace->syscalls.events.sys_exit) { 3066 pr_debug("event qualifier tracepoint filter: %s\n", 3067 trace->syscalls.events.sys_exit->filter); 3068 } 3069 } 3070 3071 err = perf_evlist__apply_filters(evlist, &evsel); 3072 if (err < 0) 3073 goto out_error_apply_filters; 3074 3075 if (trace->dump.map) 3076 bpf_map__fprintf(trace->dump.map, trace->output); 3077 3078 err = perf_evlist__mmap(evlist, trace->opts.mmap_pages); 3079 if (err < 0) 3080 goto out_error_mmap; 3081 3082 if (!target__none(&trace->opts.target) && !trace->opts.initial_delay) 3083 perf_evlist__enable(evlist); 3084 3085 if (forks) 3086 perf_evlist__start_workload(evlist); 3087 3088 if (trace->opts.initial_delay) { 3089 usleep(trace->opts.initial_delay * 1000); 3090 perf_evlist__enable(evlist); 3091 } 3092 3093 trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 || 3094 evlist->threads->nr > 1 || 3095 perf_evlist__first(evlist)->attr.inherit; 3096 3097 /* 3098 * Now that we already used evsel->attr to ask the kernel to setup the 3099 * events, lets reuse evsel->attr.sample_max_stack as the limit in 3100 * trace__resolve_callchain(), allowing per-event max-stack settings 3101 * to override an explicitly set --max-stack global setting. 
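	 *
	 * E.g. (sketch, using the usual per-event term syntax):
	 *
	 *	perf trace --max-stack=8 -e sched:sched_switch/max-stack=16/
	 *
	 * would keep 16 entries for the sched:sched_switch chains while
	 * everything else gets the global limit of 8.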
3102 */ 3103 evlist__for_each_entry(evlist, evsel) { 3104 if (evsel__has_callchain(evsel) && 3105 evsel->attr.sample_max_stack == 0) 3106 evsel->attr.sample_max_stack = trace->max_stack; 3107 } 3108 again: 3109 before = trace->nr_events; 3110 3111 for (i = 0; i < evlist->nr_mmaps; i++) { 3112 union perf_event *event; 3113 struct perf_mmap *md; 3114 3115 md = &evlist->mmap[i]; 3116 if (perf_mmap__read_init(md) < 0) 3117 continue; 3118 3119 while ((event = perf_mmap__read_event(md)) != NULL) { 3120 ++trace->nr_events; 3121 3122 err = trace__deliver_event(trace, event); 3123 if (err) 3124 goto out_disable; 3125 3126 perf_mmap__consume(md); 3127 3128 if (interrupted) 3129 goto out_disable; 3130 3131 if (done && !draining) { 3132 perf_evlist__disable(evlist); 3133 draining = true; 3134 } 3135 } 3136 perf_mmap__read_done(md); 3137 } 3138 3139 if (trace->nr_events == before) { 3140 int timeout = done ? 100 : -1; 3141 3142 if (!draining && perf_evlist__poll(evlist, timeout) > 0) { 3143 if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP | POLLNVAL) == 0) 3144 draining = true; 3145 3146 goto again; 3147 } else { 3148 if (trace__flush_events(trace)) 3149 goto out_disable; 3150 } 3151 } else { 3152 goto again; 3153 } 3154 3155 out_disable: 3156 thread__zput(trace->current); 3157 3158 perf_evlist__disable(evlist); 3159 3160 if (trace->sort_events) 3161 ordered_events__flush(&trace->oe.data, OE_FLUSH__FINAL); 3162 3163 if (!err) { 3164 if (trace->summary) 3165 trace__fprintf_thread_summary(trace, trace->output); 3166 3167 if (trace->show_tool_stats) { 3168 fprintf(trace->output, "Stats:\n " 3169 " vfs_getname : %" PRIu64 "\n" 3170 " proc_getname: %" PRIu64 "\n", 3171 trace->stats.vfs_getname, 3172 trace->stats.proc_getname); 3173 } 3174 } 3175 3176 out_delete_evlist: 3177 trace__symbols__exit(trace); 3178 3179 perf_evlist__delete(evlist); 3180 cgroup__put(trace->cgroup); 3181 trace->evlist = NULL; 3182 trace->live = false; 3183 return err; 3184 { 3185 char errbuf[BUFSIZ]; 3186 3187 out_error_sched_stat_runtime: 3188 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime"); 3189 goto out_error; 3190 3191 out_error_raw_syscalls: 3192 tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)"); 3193 goto out_error; 3194 3195 out_error_mmap: 3196 perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf)); 3197 goto out_error; 3198 3199 out_error_open: 3200 perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf)); 3201 3202 out_error: 3203 fprintf(trace->output, "%s\n", errbuf); 3204 goto out_delete_evlist; 3205 3206 out_error_apply_filters: 3207 fprintf(trace->output, 3208 "Failed to set filter \"%s\" on event %s with %d (%s)\n", 3209 evsel->filter, perf_evsel__name(evsel), errno, 3210 str_error_r(errno, errbuf, sizeof(errbuf))); 3211 goto out_delete_evlist; 3212 } 3213 out_error_mem: 3214 fprintf(trace->output, "Not enough memory to run!\n"); 3215 goto out_delete_evlist; 3216 3217 out_errno: 3218 fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno)); 3219 goto out_delete_evlist; 3220 } 3221 3222 static int trace__replay(struct trace *trace) 3223 { 3224 const struct perf_evsel_str_handler handlers[] = { 3225 { "probe:vfs_getname", trace__vfs_getname, }, 3226 }; 3227 struct perf_data data = { 3228 .path = input_name, 3229 .mode = PERF_DATA_MODE_READ, 3230 .force = trace->force, 3231 }; 3232 struct perf_session *session; 3233 struct perf_evsel *evsel; 3234 int err = -1; 3235 3236 trace->tool.sample = 
trace__process_sample;
	trace->tool.mmap = perf_event__process_mmap;
	trace->tool.mmap2 = perf_event__process_mmap2;
	trace->tool.comm = perf_event__process_comm;
	trace->tool.exit = perf_event__process_exit;
	trace->tool.fork = perf_event__process_fork;
	trace->tool.attr = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id = perf_event__process_build_id;
	trace->tool.namespaces = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&data, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have the syscalls: tracepoints instead of raw_syscalls: */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error initializing raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_raw_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error initializing raw_syscalls:sys_exit event\n");
		goto out;
	}

	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d\n", err);
	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}

static size_t trace__fprintf_threads_header(FILE *fp)
{
	size_t printed;

	printed = fprintf(fp, "\n Summary of events:\n\n");

	return printed;
}

DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	entry->msecs = stats ?
(u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0; 3342 } 3343 3344 static size_t thread__dump_stats(struct thread_trace *ttrace, 3345 struct trace *trace, FILE *fp) 3346 { 3347 size_t printed = 0; 3348 struct syscall *sc; 3349 struct rb_node *nd; 3350 DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats); 3351 3352 if (syscall_stats == NULL) 3353 return 0; 3354 3355 printed += fprintf(fp, "\n"); 3356 3357 printed += fprintf(fp, " syscall calls total min avg max stddev\n"); 3358 printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); 3359 printed += fprintf(fp, " --------------- -------- --------- --------- --------- --------- ------\n"); 3360 3361 resort_rb__for_each_entry(nd, syscall_stats) { 3362 struct stats *stats = syscall_stats_entry->stats; 3363 if (stats) { 3364 double min = (double)(stats->min) / NSEC_PER_MSEC; 3365 double max = (double)(stats->max) / NSEC_PER_MSEC; 3366 double avg = avg_stats(stats); 3367 double pct; 3368 u64 n = (u64) stats->n; 3369 3370 pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0; 3371 avg /= NSEC_PER_MSEC; 3372 3373 sc = &trace->syscalls.table[syscall_stats_entry->syscall]; 3374 printed += fprintf(fp, " %-15s", sc->name); 3375 printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f", 3376 n, syscall_stats_entry->msecs, min, avg); 3377 printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct); 3378 } 3379 } 3380 3381 resort_rb__delete(syscall_stats); 3382 printed += fprintf(fp, "\n\n"); 3383 3384 return printed; 3385 } 3386 3387 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace) 3388 { 3389 size_t printed = 0; 3390 struct thread_trace *ttrace = thread__priv(thread); 3391 double ratio; 3392 3393 if (ttrace == NULL) 3394 return 0; 3395 3396 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0; 3397 3398 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid); 3399 printed += fprintf(fp, "%lu events, ", ttrace->nr_events); 3400 printed += fprintf(fp, "%.1f%%", ratio); 3401 if (ttrace->pfmaj) 3402 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj); 3403 if (ttrace->pfmin) 3404 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin); 3405 if (trace->sched) 3406 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms); 3407 else if (fputc('\n', fp) != EOF) 3408 ++printed; 3409 3410 printed += thread__dump_stats(ttrace, trace, fp); 3411 3412 return printed; 3413 } 3414 3415 static unsigned long thread__nr_events(struct thread_trace *ttrace) 3416 { 3417 return ttrace ? 
ttrace->nr_events : 0; 3418 } 3419 3420 DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)), 3421 struct thread *thread; 3422 ) 3423 { 3424 entry->thread = rb_entry(nd, struct thread, rb_node); 3425 } 3426 3427 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp) 3428 { 3429 size_t printed = trace__fprintf_threads_header(fp); 3430 struct rb_node *nd; 3431 int i; 3432 3433 for (i = 0; i < THREADS__TABLE_SIZE; i++) { 3434 DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host, i); 3435 3436 if (threads == NULL) { 3437 fprintf(fp, "%s", "Error sorting output by nr_events!\n"); 3438 return 0; 3439 } 3440 3441 resort_rb__for_each_entry(nd, threads) 3442 printed += trace__fprintf_thread(fp, threads_entry->thread, trace); 3443 3444 resort_rb__delete(threads); 3445 } 3446 return printed; 3447 } 3448 3449 static int trace__set_duration(const struct option *opt, const char *str, 3450 int unset __maybe_unused) 3451 { 3452 struct trace *trace = opt->value; 3453 3454 trace->duration_filter = atof(str); 3455 return 0; 3456 } 3457 3458 static int trace__set_filter_pids_from_option(const struct option *opt, const char *str, 3459 int unset __maybe_unused) 3460 { 3461 int ret = -1; 3462 size_t i; 3463 struct trace *trace = opt->value; 3464 /* 3465 * FIXME: introduce a intarray class, plain parse csv and create a 3466 * { int nr, int entries[] } struct... 3467 */ 3468 struct intlist *list = intlist__new(str); 3469 3470 if (list == NULL) 3471 return -1; 3472 3473 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1; 3474 trace->filter_pids.entries = calloc(i, sizeof(pid_t)); 3475 3476 if (trace->filter_pids.entries == NULL) 3477 goto out; 3478 3479 trace->filter_pids.entries[0] = getpid(); 3480 3481 for (i = 1; i < trace->filter_pids.nr; ++i) 3482 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i; 3483 3484 intlist__delete(list); 3485 ret = 0; 3486 out: 3487 return ret; 3488 } 3489 3490 static int trace__open_output(struct trace *trace, const char *filename) 3491 { 3492 struct stat st; 3493 3494 if (!stat(filename, &st) && st.st_size) { 3495 char oldname[PATH_MAX]; 3496 3497 scnprintf(oldname, sizeof(oldname), "%s.old", filename); 3498 unlink(oldname); 3499 rename(filename, oldname); 3500 } 3501 3502 trace->output = fopen(filename, "w"); 3503 3504 return trace->output == NULL ? 
-errno : 0;
}

static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;

	if (strcmp(str, "all") == 0)
		*trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
	else if (strcmp(str, "maj") == 0)
		*trace_pgfaults |= TRACE_PFMAJ;
	else if (strcmp(str, "min") == 0)
		*trace_pgfaults |= TRACE_PFMIN;
	else
		return -1;

	return 0;
}

static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}

static int evlist__set_syscall_tp_fields(struct perf_evlist *evlist)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel) {
		if (evsel->priv || !evsel->tp_format)
			continue;

		if (strcmp(evsel->tp_format->system, "syscalls"))
			continue;

		if (perf_evsel__init_syscall_tp(evsel))
			return -1;

		if (!strncmp(evsel->tp_format->name, "sys_enter_", 10)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_ptr(&sc->args, sc->id.offset + sizeof(u64)))
				return -1;
		} else if (!strncmp(evsel->tp_format->name, "sys_exit_", 9)) {
			struct syscall_tp *sc = evsel->priv;

			if (__tp_field__init_uint(&sc->ret, sizeof(u64), sc->id.offset + sizeof(u64), evsel->needs_swap))
				return -1;
		}
	}

	return 0;
}

/*
 * XXX: Hackish, just splitting the combined -e+--event list: the syscalls
 * (which end up as raw_syscalls:sys_{enter,exit} plus a filter) get separated
 * from the other events (tracepoints, HW, SW, etc) so that the existing
 * facilities can be used unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
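 *
 * E.g.: 'perf trace -e open*,sched:sched_switch' gets split into the syscall
 * glob "open*", which lands in trace->ev_qualifier, and the tracepoint
 * "sched:sched_switch", which is handed back to parse_events_option().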
3569 */ 3570 static int trace__parse_events_option(const struct option *opt, const char *str, 3571 int unset __maybe_unused) 3572 { 3573 struct trace *trace = (struct trace *)opt->value; 3574 const char *s = str; 3575 char *sep = NULL, *lists[2] = { NULL, NULL, }; 3576 int len = strlen(str) + 1, err = -1, list, idx; 3577 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR); 3578 char group_name[PATH_MAX]; 3579 struct syscall_fmt *fmt; 3580 3581 if (strace_groups_dir == NULL) 3582 return -1; 3583 3584 if (*s == '!') { 3585 ++s; 3586 trace->not_ev_qualifier = true; 3587 } 3588 3589 while (1) { 3590 if ((sep = strchr(s, ',')) != NULL) 3591 *sep = '\0'; 3592 3593 list = 0; 3594 if (syscalltbl__id(trace->sctbl, s) >= 0 || 3595 syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) { 3596 list = 1; 3597 goto do_concat; 3598 } 3599 3600 fmt = syscall_fmt__find_by_alias(s); 3601 if (fmt != NULL) { 3602 list = 1; 3603 s = fmt->name; 3604 } else { 3605 path__join(group_name, sizeof(group_name), strace_groups_dir, s); 3606 if (access(group_name, R_OK) == 0) 3607 list = 1; 3608 } 3609 do_concat: 3610 if (lists[list]) { 3611 sprintf(lists[list] + strlen(lists[list]), ",%s", s); 3612 } else { 3613 lists[list] = malloc(len); 3614 if (lists[list] == NULL) 3615 goto out; 3616 strcpy(lists[list], s); 3617 } 3618 3619 if (!sep) 3620 break; 3621 3622 *sep = ','; 3623 s = sep + 1; 3624 } 3625 3626 if (lists[1] != NULL) { 3627 struct strlist_config slist_config = { 3628 .dirname = strace_groups_dir, 3629 }; 3630 3631 trace->ev_qualifier = strlist__new(lists[1], &slist_config); 3632 if (trace->ev_qualifier == NULL) { 3633 fputs("Not enough memory to parse event qualifier", trace->output); 3634 goto out; 3635 } 3636 3637 if (trace__validate_ev_qualifier(trace)) 3638 goto out; 3639 trace->trace_syscalls = true; 3640 } 3641 3642 err = 0; 3643 3644 if (lists[0]) { 3645 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", 3646 "event selector. use 'perf list' to list available events", 3647 parse_events_option); 3648 err = parse_events_option(&o, lists[0], 0); 3649 } 3650 out: 3651 if (sep) 3652 *sep = ','; 3653 3654 return err; 3655 } 3656 3657 static int trace__parse_cgroups(const struct option *opt, const char *str, int unset) 3658 { 3659 struct trace *trace = opt->value; 3660 3661 if (!list_empty(&trace->evlist->entries)) 3662 return parse_cgroups(opt, str, unset); 3663 3664 trace->cgroup = evlist__findnew_cgroup(trace->evlist, str); 3665 3666 return 0; 3667 } 3668 3669 static struct bpf_map *bpf__find_map_by_name(const char *name) 3670 { 3671 struct bpf_object *obj, *tmp; 3672 3673 bpf_object__for_each_safe(obj, tmp) { 3674 struct bpf_map *map = bpf_object__find_map_by_name(obj, name); 3675 if (map) 3676 return map; 3677 3678 } 3679 3680 return NULL; 3681 } 3682 3683 static void trace__set_bpf_map_filtered_pids(struct trace *trace) 3684 { 3685 trace->filter_pids.map = bpf__find_map_by_name("pids_filtered"); 3686 } 3687 3688 static void trace__set_bpf_map_syscalls(struct trace *trace) 3689 { 3690 trace->syscalls.map = bpf__find_map_by_name("syscalls"); 3691 } 3692 3693 static int trace__config(const char *var, const char *value, void *arg) 3694 { 3695 struct trace *trace = arg; 3696 int err = 0; 3697 3698 if (!strcmp(var, "trace.add_events")) { 3699 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event", 3700 "event selector. 
use 'perf list' to list available events", 3701 parse_events_option); 3702 /* 3703 * We can't propagate parse_event_option() return, as it is 1 3704 * for failure while perf_config() expects -1. 3705 */ 3706 if (parse_events_option(&o, value, 0)) 3707 err = -1; 3708 } else if (!strcmp(var, "trace.show_timestamp")) { 3709 trace->show_tstamp = perf_config_bool(var, value); 3710 } else if (!strcmp(var, "trace.show_duration")) { 3711 trace->show_duration = perf_config_bool(var, value); 3712 } else if (!strcmp(var, "trace.show_arg_names")) { 3713 trace->show_arg_names = perf_config_bool(var, value); 3714 if (!trace->show_arg_names) 3715 trace->show_zeros = true; 3716 } else if (!strcmp(var, "trace.show_zeros")) { 3717 bool new_show_zeros = perf_config_bool(var, value); 3718 if (!trace->show_arg_names && !new_show_zeros) { 3719 pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n"); 3720 goto out; 3721 } 3722 trace->show_zeros = new_show_zeros; 3723 } else if (!strcmp(var, "trace.show_prefix")) { 3724 trace->show_string_prefix = perf_config_bool(var, value); 3725 } else if (!strcmp(var, "trace.no_inherit")) { 3726 trace->opts.no_inherit = perf_config_bool(var, value); 3727 } else if (!strcmp(var, "trace.args_alignment")) { 3728 int args_alignment = 0; 3729 if (perf_config_int(&args_alignment, var, value) == 0) 3730 trace->args_alignment = args_alignment; 3731 } 3732 out: 3733 return err; 3734 } 3735 3736 int cmd_trace(int argc, const char **argv) 3737 { 3738 const char *trace_usage[] = { 3739 "perf trace [<options>] [<command>]", 3740 "perf trace [<options>] -- <command> [<options>]", 3741 "perf trace record [<options>] [<command>]", 3742 "perf trace record [<options>] -- <command> [<options>]", 3743 NULL 3744 }; 3745 struct trace trace = { 3746 .syscalls = { 3747 . max = -1, 3748 }, 3749 .opts = { 3750 .target = { 3751 .uid = UINT_MAX, 3752 .uses_mmap = true, 3753 }, 3754 .user_freq = UINT_MAX, 3755 .user_interval = ULLONG_MAX, 3756 .no_buffering = true, 3757 .mmap_pages = UINT_MAX, 3758 }, 3759 .output = stderr, 3760 .show_comm = true, 3761 .show_tstamp = true, 3762 .show_duration = true, 3763 .show_arg_names = true, 3764 .args_alignment = 70, 3765 .trace_syscalls = false, 3766 .kernel_syscallchains = false, 3767 .max_stack = UINT_MAX, 3768 .max_events = ULONG_MAX, 3769 }; 3770 const char *map_dump_str = NULL; 3771 const char *output_name = NULL; 3772 const struct option trace_options[] = { 3773 OPT_CALLBACK('e', "event", &trace, "event", 3774 "event/syscall selector. 
static struct bpf_map *bpf__find_map_by_name(const char *name)
{
	struct bpf_object *obj, *tmp;

	bpf_object__for_each_safe(obj, tmp) {
		struct bpf_map *map = bpf_object__find_map_by_name(obj, name);
		if (map)
			return map;
	}

	return NULL;
}

static void trace__set_bpf_map_filtered_pids(struct trace *trace)
{
	trace->filter_pids.map = bpf__find_map_by_name("pids_filtered");
}

static void trace__set_bpf_map_syscalls(struct trace *trace)
{
	trace->syscalls.map = bpf__find_map_by_name("syscalls");
}

static int trace__config(const char *var, const char *value, void *arg)
{
	struct trace *trace = arg;
	int err = 0;

	if (!strcmp(var, "trace.add_events")) {
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		/*
		 * We can't propagate the parse_events_option() return, as it
		 * is 1 for failure while perf_config() expects -1.
		 */
		if (parse_events_option(&o, value, 0))
			err = -1;
	} else if (!strcmp(var, "trace.show_timestamp")) {
		trace->show_tstamp = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_duration")) {
		trace->show_duration = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.show_arg_names")) {
		trace->show_arg_names = perf_config_bool(var, value);
		if (!trace->show_arg_names)
			trace->show_zeros = true;
	} else if (!strcmp(var, "trace.show_zeros")) {
		bool new_show_zeros = perf_config_bool(var, value);

		if (!trace->show_arg_names && !new_show_zeros) {
			pr_warning("trace.show_zeros has to be set when trace.show_arg_names=no\n");
			goto out;
		}
		trace->show_zeros = new_show_zeros;
	} else if (!strcmp(var, "trace.show_prefix")) {
		trace->show_string_prefix = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.no_inherit")) {
		trace->opts.no_inherit = perf_config_bool(var, value);
	} else if (!strcmp(var, "trace.args_alignment")) {
		int args_alignment = 0;

		if (perf_config_int(&args_alignment, var, value) == 0)
			trace->args_alignment = args_alignment;
	}
out:
	return err;
}
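/*
 * For reference, a ~/.perfconfig snippet that would exercise these
 * knobs; the section/key layout follows the usual perf config INI
 * format and the values here are just examples:
 *
 *	[trace]
 *		add_events = probe:vfs_getname
 *		show_duration = no
 *		show_zeros = yes
 *		args_alignment = 40
 */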
" 3833 "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)), 3834 OPT_BOOLEAN(0, "sort-events", &trace.sort_events, 3835 "Sort batch of events before processing, use if getting out of order events"), 3836 OPT_BOOLEAN(0, "print-sample", &trace.print_sample, 3837 "print the PERF_RECORD_SAMPLE PERF_SAMPLE_ info, for debugging"), 3838 OPT_UINTEGER(0, "proc-map-timeout", &proc_map_timeout, 3839 "per thread proc mmap processing timeout in ms"), 3840 OPT_CALLBACK('G', "cgroup", &trace, "name", "monitor event in cgroup name only", 3841 trace__parse_cgroups), 3842 OPT_UINTEGER('D', "delay", &trace.opts.initial_delay, 3843 "ms to wait before starting measurement after program " 3844 "start"), 3845 OPT_END() 3846 }; 3847 bool __maybe_unused max_stack_user_set = true; 3848 bool mmap_pages_user_set = true; 3849 struct perf_evsel *evsel; 3850 const char * const trace_subcommands[] = { "record", NULL }; 3851 int err = -1; 3852 char bf[BUFSIZ]; 3853 3854 signal(SIGSEGV, sighandler_dump_stack); 3855 signal(SIGFPE, sighandler_dump_stack); 3856 3857 trace.evlist = perf_evlist__new(); 3858 trace.sctbl = syscalltbl__new(); 3859 3860 if (trace.evlist == NULL || trace.sctbl == NULL) { 3861 pr_err("Not enough memory to run!\n"); 3862 err = -ENOMEM; 3863 goto out; 3864 } 3865 3866 err = perf_config(trace__config, &trace); 3867 if (err) 3868 goto out; 3869 3870 argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands, 3871 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION); 3872 3873 if ((nr_cgroups || trace.cgroup) && !trace.opts.target.system_wide) { 3874 usage_with_options_msg(trace_usage, trace_options, 3875 "cgroup monitoring only available in system-wide mode"); 3876 } 3877 3878 evsel = bpf__setup_output_event(trace.evlist, "__augmented_syscalls__"); 3879 if (IS_ERR(evsel)) { 3880 bpf__strerror_setup_output_event(trace.evlist, PTR_ERR(evsel), bf, sizeof(bf)); 3881 pr_err("ERROR: Setup trace syscalls enter failed: %s\n", bf); 3882 goto out; 3883 } 3884 3885 if (evsel) { 3886 trace.syscalls.events.augmented = evsel; 3887 trace__set_bpf_map_filtered_pids(&trace); 3888 trace__set_bpf_map_syscalls(&trace); 3889 } 3890 3891 err = bpf__setup_stdout(trace.evlist); 3892 if (err) { 3893 bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf)); 3894 pr_err("ERROR: Setup BPF stdout failed: %s\n", bf); 3895 goto out; 3896 } 3897 3898 err = -1; 3899 3900 if (map_dump_str) { 3901 trace.dump.map = bpf__find_map_by_name(map_dump_str); 3902 if (trace.dump.map == NULL) { 3903 pr_err("ERROR: BPF map \"%s\" not found\n", map_dump_str); 3904 goto out; 3905 } 3906 } 3907 3908 if (trace.trace_pgfaults) { 3909 trace.opts.sample_address = true; 3910 trace.opts.sample_time = true; 3911 } 3912 3913 if (trace.opts.mmap_pages == UINT_MAX) 3914 mmap_pages_user_set = false; 3915 3916 if (trace.max_stack == UINT_MAX) { 3917 trace.max_stack = input_name ? 
#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled) {
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
	}
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0) {
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);
		if (evlist__set_syscall_tp_fields(trace.evlist)) {
			perror("failed to set syscalls:* tracepoint fields");
			goto out;
		}
	}

	if (trace.sort_events) {
		ordered_events__init(&trace.oe.data, ordered_events__deliver_event, &trace);
		ordered_events__set_copy_on_queue(&trace.oe.data, true);
	}

	/*
	 * If we are augmenting syscalls, then combine what we put in the
	 * __augmented_syscalls__ BPF map with what is in the
	 * syscalls:sys_exit_FOO tracepoints, i.e. just like we do without BPF,
	 * combining raw_syscalls:sys_enter with raw_syscalls:sys_exit.
	 *
	 * We'll switch to look at two BPF maps, one for sys_enter and the
	 * other for sys_exit, when we start augmenting the sys_exit paths with
	 * buffers that are being copied from kernel to userspace, think 'read'
	 * syscall.
	 */
	if (trace.syscalls.events.augmented) {
		evlist__for_each_entry(trace.evlist, evsel) {
			bool raw_syscalls_sys_exit = strcmp(perf_evsel__name(evsel), "raw_syscalls:sys_exit") == 0;

			if (raw_syscalls_sys_exit) {
				trace.raw_augmented_syscalls = true;
				goto init_augmented_syscall_tp;
			}

			if (trace.syscalls.events.augmented->priv == NULL &&
			    strstr(perf_evsel__name(evsel), "syscalls:sys_enter")) {
				struct perf_evsel *augmented = trace.syscalls.events.augmented;

				if (perf_evsel__init_augmented_syscall_tp(augmented, evsel) ||
				    perf_evsel__init_augmented_syscall_tp_args(augmented))
					goto out;
				augmented->handler = trace__sys_enter;
			}

			if (strstarts(perf_evsel__name(evsel), "syscalls:sys_exit_")) {
				struct syscall_tp *sc;
init_augmented_syscall_tp:
				if (perf_evsel__init_augmented_syscall_tp(evsel, evsel))
					goto out;
				sc = evsel->priv;
				/*
				 * For now with BPF raw_augmented we hook into
				 * raw_syscalls:sys_enter and there we get all
				 * 6 syscall args plus the tracepoint common
				 * fields and the syscall_nr (another long).
				 * So we check if that is the case and if so
				 * don't stop at sc->args_size but always
				 * consume the full raw_syscalls:sys_enter
				 * payload, which is fixed.
				 *
				 * We'll revisit this later to pass
				 * sc->args_size to the BPF augmenter (now
				 * tools/perf/examples/bpf/augmented_raw_syscalls.c),
				 * so that it copies only what we need for each
				 * syscall, like what happens when we use
				 * syscalls:sys_enter_NAME, so that we reduce
				 * the kernel/userspace traffic to just what is
				 * needed for each syscall.
				 */
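				/*
				 * A worked instance of the fixed size computed
				 * below, assuming 64-bit longs and the usual
				 * 8 bytes of common tracepoint fields before
				 * the syscall id (check the tracepoint format
				 * file, this is just an example layout):
				 *
				 *	(6 + 1) * sizeof(long) + sc->id.offset
				 *	= 7 * 8 + 8 = 64 bytes
				 */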
				if (trace.raw_augmented_syscalls)
					trace.raw_augmented_syscalls_args_size = (6 + 1) * sizeof(long) + sc->id.offset;
				perf_evsel__init_augmented_syscall_tp_ret(evsel);
				evsel->handler = trace__sys_exit;
			}
		}
	}

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		trace.trace_syscalls = true;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}