1 #include <trace/syscall.h> 2 #include <trace/events/syscalls.h> 3 #include <linux/slab.h> 4 #include <linux/kernel.h> 5 #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */ 6 #include <linux/ftrace.h> 7 #include <linux/perf_event.h> 8 #include <asm/syscall.h> 9 10 #include "trace_output.h" 11 #include "trace.h" 12 13 static DEFINE_MUTEX(syscall_trace_lock); 14 static int sys_refcount_enter; 15 static int sys_refcount_exit; 16 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 17 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 18 19 static int syscall_enter_register(struct ftrace_event_call *event, 20 enum trace_reg type, void *data); 21 static int syscall_exit_register(struct ftrace_event_call *event, 22 enum trace_reg type, void *data); 23 24 static struct list_head * 25 syscall_get_enter_fields(struct ftrace_event_call *call) 26 { 27 struct syscall_metadata *entry = call->data; 28 29 return &entry->enter_fields; 30 } 31 32 extern struct syscall_metadata *__start_syscalls_metadata[]; 33 extern struct syscall_metadata *__stop_syscalls_metadata[]; 34 35 static struct syscall_metadata **syscalls_metadata; 36 37 #ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME 38 static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) 39 { 40 /* 41 * Only compare after the "sys" prefix. Archs that use 42 * syscall wrappers may have syscalls symbols aliases prefixed 43 * with "SyS" instead of "sys", leading to an unwanted 44 * mismatch. 45 */ 46 return !strcmp(sym + 3, name + 3); 47 } 48 #endif 49 50 static __init struct syscall_metadata * 51 find_syscall_meta(unsigned long syscall) 52 { 53 struct syscall_metadata **start; 54 struct syscall_metadata **stop; 55 char str[KSYM_SYMBOL_LEN]; 56 57 58 start = __start_syscalls_metadata; 59 stop = __stop_syscalls_metadata; 60 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 61 62 if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) 63 return NULL; 64 65 for ( ; start < stop; start++) { 66 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) 67 return *start; 68 } 69 return NULL; 70 } 71 72 static struct syscall_metadata *syscall_nr_to_meta(int nr) 73 { 74 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) 75 return NULL; 76 77 return syscalls_metadata[nr]; 78 } 79 80 enum print_line_t 81 print_syscall_enter(struct trace_iterator *iter, int flags, 82 struct trace_event *event) 83 { 84 struct trace_seq *s = &iter->seq; 85 struct trace_entry *ent = iter->ent; 86 struct syscall_trace_enter *trace; 87 struct syscall_metadata *entry; 88 int i, ret, syscall; 89 90 trace = (typeof(trace))ent; 91 syscall = trace->nr; 92 entry = syscall_nr_to_meta(syscall); 93 94 if (!entry) 95 goto end; 96 97 if (entry->enter_event->event.type != ent->type) { 98 WARN_ON_ONCE(1); 99 goto end; 100 } 101 102 ret = trace_seq_printf(s, "%s(", entry->name); 103 if (!ret) 104 return TRACE_TYPE_PARTIAL_LINE; 105 106 for (i = 0; i < entry->nb_args; i++) { 107 /* parameter types */ 108 if (trace_flags & TRACE_ITER_VERBOSE) { 109 ret = trace_seq_printf(s, "%s ", entry->types[i]); 110 if (!ret) 111 return TRACE_TYPE_PARTIAL_LINE; 112 } 113 /* parameter values */ 114 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 115 trace->args[i], 116 i == entry->nb_args - 1 ? "" : ", "); 117 if (!ret) 118 return TRACE_TYPE_PARTIAL_LINE; 119 } 120 121 ret = trace_seq_putc(s, ')'); 122 if (!ret) 123 return TRACE_TYPE_PARTIAL_LINE; 124 125 end: 126 ret = trace_seq_putc(s, '\n'); 127 if (!ret) 128 return TRACE_TYPE_PARTIAL_LINE; 129 130 return TRACE_TYPE_HANDLED; 131 } 132 133 enum print_line_t 134 print_syscall_exit(struct trace_iterator *iter, int flags, 135 struct trace_event *event) 136 { 137 struct trace_seq *s = &iter->seq; 138 struct trace_entry *ent = iter->ent; 139 struct syscall_trace_exit *trace; 140 int syscall; 141 struct syscall_metadata *entry; 142 int ret; 143 144 trace = (typeof(trace))ent; 145 syscall = trace->nr; 146 entry = syscall_nr_to_meta(syscall); 147 148 if (!entry) { 149 trace_seq_printf(s, "\n"); 150 return TRACE_TYPE_HANDLED; 151 } 152 153 if (entry->exit_event->event.type != ent->type) { 154 WARN_ON_ONCE(1); 155 return TRACE_TYPE_UNHANDLED; 156 } 157 158 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 159 trace->ret); 160 if (!ret) 161 return TRACE_TYPE_PARTIAL_LINE; 162 163 return TRACE_TYPE_HANDLED; 164 } 165 166 extern char *__bad_type_size(void); 167 168 #define SYSCALL_FIELD(type, name) \ 169 sizeof(type) != sizeof(trace.name) ? \ 170 __bad_type_size() : \ 171 #type, #name, offsetof(typeof(trace), name), \ 172 sizeof(trace.name), is_signed_type(type) 173 174 static 175 int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 176 { 177 int i; 178 int pos = 0; 179 180 /* When len=0, we just calculate the needed length */ 181 #define LEN_OR_ZERO (len ? len - pos : 0) 182 183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 184 for (i = 0; i < entry->nb_args; i++) { 185 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", 186 entry->args[i], sizeof(unsigned long), 187 i == entry->nb_args - 1 ? "" : ", "); 188 } 189 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 190 191 for (i = 0; i < entry->nb_args; i++) { 192 pos += snprintf(buf + pos, LEN_OR_ZERO, 193 ", ((unsigned long)(REC->%s))", entry->args[i]); 194 } 195 196 #undef LEN_OR_ZERO 197 198 /* return the length of print_fmt */ 199 return pos; 200 } 201 202 static int set_syscall_print_fmt(struct ftrace_event_call *call) 203 { 204 char *print_fmt; 205 int len; 206 struct syscall_metadata *entry = call->data; 207 208 if (entry->enter_event != call) { 209 call->print_fmt = "\"0x%lx\", REC->ret"; 210 return 0; 211 } 212 213 /* First: called with 0 length to calculate the needed length */ 214 len = __set_enter_print_fmt(entry, NULL, 0); 215 216 print_fmt = kmalloc(len + 1, GFP_KERNEL); 217 if (!print_fmt) 218 return -ENOMEM; 219 220 /* Second: actually write the @print_fmt */ 221 __set_enter_print_fmt(entry, print_fmt, len + 1); 222 call->print_fmt = print_fmt; 223 224 return 0; 225 } 226 227 static void free_syscall_print_fmt(struct ftrace_event_call *call) 228 { 229 struct syscall_metadata *entry = call->data; 230 231 if (entry->enter_event == call) 232 kfree(call->print_fmt); 233 } 234 235 static int syscall_enter_define_fields(struct ftrace_event_call *call) 236 { 237 struct syscall_trace_enter trace; 238 struct syscall_metadata *meta = call->data; 239 int ret; 240 int i; 241 int offset = offsetof(typeof(trace), args); 242 243 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 244 if (ret) 245 return ret; 246 247 for (i = 0; i < meta->nb_args; i++) { 248 ret = trace_define_field(call, meta->types[i], 249 meta->args[i], offset, 250 sizeof(unsigned long), 0, 251 FILTER_OTHER); 252 offset += sizeof(unsigned long); 253 } 254 255 return ret; 256 } 257 258 static int syscall_exit_define_fields(struct ftrace_event_call *call) 259 { 260 struct syscall_trace_exit trace; 261 int ret; 262 263 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 264 if (ret) 265 return ret; 266 267 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 268 FILTER_OTHER); 269 270 return ret; 271 } 272 273 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 274 { 275 struct syscall_trace_enter *entry; 276 struct syscall_metadata *sys_data; 277 struct ring_buffer_event *event; 278 struct ring_buffer *buffer; 279 int size; 280 int syscall_nr; 281 282 syscall_nr = syscall_get_nr(current, regs); 283 if (syscall_nr < 0) 284 return; 285 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 286 return; 287 288 sys_data = syscall_nr_to_meta(syscall_nr); 289 if (!sys_data) 290 return; 291 292 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 293 294 event = trace_current_buffer_lock_reserve(&buffer, 295 sys_data->enter_event->event.type, size, 0, 0); 296 if (!event) 297 return; 298 299 entry = ring_buffer_event_data(event); 300 entry->nr = syscall_nr; 301 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 302 303 if (!filter_current_check_discard(buffer, sys_data->enter_event, 304 entry, event)) 305 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 306 } 307 308 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 309 { 310 struct syscall_trace_exit *entry; 311 struct syscall_metadata *sys_data; 312 struct ring_buffer_event *event; 313 struct ring_buffer *buffer; 314 int syscall_nr; 315 316 syscall_nr = syscall_get_nr(current, regs); 317 if (syscall_nr < 0) 318 return; 319 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 320 return; 321 322 sys_data = syscall_nr_to_meta(syscall_nr); 323 if (!sys_data) 324 return; 325 326 event = trace_current_buffer_lock_reserve(&buffer, 327 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 328 if (!event) 329 return; 330 331 entry = ring_buffer_event_data(event); 332 entry->nr = syscall_nr; 333 entry->ret = syscall_get_return_value(current, regs); 334 335 if (!filter_current_check_discard(buffer, sys_data->exit_event, 336 entry, event)) 337 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 338 } 339 340 int reg_event_syscall_enter(struct ftrace_event_call *call) 341 { 342 int ret = 0; 343 int num; 344 345 num = ((struct syscall_metadata *)call->data)->syscall_nr; 346 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 347 return -ENOSYS; 348 mutex_lock(&syscall_trace_lock); 349 if (!sys_refcount_enter) 350 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 351 if (!ret) { 352 set_bit(num, enabled_enter_syscalls); 353 sys_refcount_enter++; 354 } 355 mutex_unlock(&syscall_trace_lock); 356 return ret; 357 } 358 359 void unreg_event_syscall_enter(struct ftrace_event_call *call) 360 { 361 int num; 362 363 num = ((struct syscall_metadata *)call->data)->syscall_nr; 364 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 365 return; 366 mutex_lock(&syscall_trace_lock); 367 sys_refcount_enter--; 368 clear_bit(num, enabled_enter_syscalls); 369 if (!sys_refcount_enter) 370 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 371 mutex_unlock(&syscall_trace_lock); 372 } 373 374 int reg_event_syscall_exit(struct ftrace_event_call *call) 375 { 376 int ret = 0; 377 int num; 378 379 num = ((struct syscall_metadata *)call->data)->syscall_nr; 380 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 381 return -ENOSYS; 382 mutex_lock(&syscall_trace_lock); 383 if (!sys_refcount_exit) 384 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 385 if (!ret) { 386 set_bit(num, enabled_exit_syscalls); 387 sys_refcount_exit++; 388 } 389 mutex_unlock(&syscall_trace_lock); 390 return ret; 391 } 392 393 void unreg_event_syscall_exit(struct ftrace_event_call *call) 394 { 395 int num; 396 397 num = ((struct syscall_metadata *)call->data)->syscall_nr; 398 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) 399 return; 400 mutex_lock(&syscall_trace_lock); 401 sys_refcount_exit--; 402 clear_bit(num, enabled_exit_syscalls); 403 if (!sys_refcount_exit) 404 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 405 mutex_unlock(&syscall_trace_lock); 406 } 407 408 static int init_syscall_trace(struct ftrace_event_call *call) 409 { 410 int id; 411 int num; 412 413 num = ((struct syscall_metadata *)call->data)->syscall_nr; 414 if (num < 0 || num >= NR_syscalls) { 415 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n", 416 ((struct syscall_metadata *)call->data)->name); 417 return -ENOSYS; 418 } 419 420 if (set_syscall_print_fmt(call) < 0) 421 return -ENOMEM; 422 423 id = trace_event_raw_init(call); 424 425 if (id < 0) { 426 free_syscall_print_fmt(call); 427 return id; 428 } 429 430 return id; 431 } 432 433 struct trace_event_functions enter_syscall_print_funcs = { 434 .trace = print_syscall_enter, 435 }; 436 437 struct trace_event_functions exit_syscall_print_funcs = { 438 .trace = print_syscall_exit, 439 }; 440 441 struct ftrace_event_class event_class_syscall_enter = { 442 .system = "syscalls", 443 .reg = syscall_enter_register, 444 .define_fields = syscall_enter_define_fields, 445 .get_fields = syscall_get_enter_fields, 446 .raw_init = init_syscall_trace, 447 }; 448 449 struct ftrace_event_class event_class_syscall_exit = { 450 .system = "syscalls", 451 .reg = syscall_exit_register, 452 .define_fields = syscall_exit_define_fields, 453 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), 454 .raw_init = init_syscall_trace, 455 }; 456 457 unsigned long __init __weak arch_syscall_addr(int nr) 458 { 459 return (unsigned long)sys_call_table[nr]; 460 } 461 462 int __init init_ftrace_syscalls(void) 463 { 464 struct syscall_metadata *meta; 465 unsigned long addr; 466 int i; 467 468 syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata), 469 GFP_KERNEL); 470 if (!syscalls_metadata) { 471 WARN_ON(1); 472 return -ENOMEM; 473 } 474 475 for (i = 0; i < NR_syscalls; i++) { 476 addr = arch_syscall_addr(i); 477 meta = find_syscall_meta(addr); 478 if (!meta) 479 continue; 480 481 meta->syscall_nr = i; 482 syscalls_metadata[i] = meta; 483 } 484 485 return 0; 486 } 487 early_initcall(init_ftrace_syscalls); 488 489 #ifdef CONFIG_PERF_EVENTS 490 491 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); 492 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); 493 static int sys_perf_refcount_enter; 494 static int sys_perf_refcount_exit; 495 496 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) 497 { 498 struct syscall_metadata *sys_data; 499 struct syscall_trace_enter *rec; 500 struct hlist_head *head; 501 int syscall_nr; 502 int rctx; 503 int size; 504 505 syscall_nr = syscall_get_nr(current, regs); 506 if (syscall_nr < 0) 507 return; 508 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 509 return; 510 511 sys_data = syscall_nr_to_meta(syscall_nr); 512 if (!sys_data) 513 return; 514 515 /* get the size after alignment with the u32 buffer size field */ 516 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 517 size = ALIGN(size + sizeof(u32), sizeof(u64)); 518 size -= sizeof(u32); 519 520 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 521 "perf buffer not large enough")) 522 return; 523 524 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 525 sys_data->enter_event->event.type, regs, &rctx); 526 if (!rec) 527 return; 528 529 rec->nr = syscall_nr; 530 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 531 (unsigned long *)&rec->args); 532 533 head = this_cpu_ptr(sys_data->enter_event->perf_events); 534 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 535 } 536 537 static int perf_sysenter_enable(struct ftrace_event_call *call) 538 { 539 int ret = 0; 540 int num; 541 542 num = ((struct syscall_metadata *)call->data)->syscall_nr; 543 544 mutex_lock(&syscall_trace_lock); 545 if (!sys_perf_refcount_enter) 546 ret = register_trace_sys_enter(perf_syscall_enter, NULL); 547 if (ret) { 548 pr_info("event trace: Could not activate" 549 "syscall entry trace point"); 550 } else { 551 set_bit(num, enabled_perf_enter_syscalls); 552 sys_perf_refcount_enter++; 553 } 554 mutex_unlock(&syscall_trace_lock); 555 return ret; 556 } 557 558 static void perf_sysenter_disable(struct ftrace_event_call *call) 559 { 560 int num; 561 562 num = ((struct syscall_metadata *)call->data)->syscall_nr; 563 564 mutex_lock(&syscall_trace_lock); 565 sys_perf_refcount_enter--; 566 clear_bit(num, enabled_perf_enter_syscalls); 567 if (!sys_perf_refcount_enter) 568 unregister_trace_sys_enter(perf_syscall_enter, NULL); 569 mutex_unlock(&syscall_trace_lock); 570 } 571 572 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 573 { 574 struct syscall_metadata *sys_data; 575 struct syscall_trace_exit *rec; 576 struct hlist_head *head; 577 int syscall_nr; 578 int rctx; 579 int size; 580 581 syscall_nr = syscall_get_nr(current, regs); 582 if (syscall_nr < 0) 583 return; 584 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 585 return; 586 587 sys_data = syscall_nr_to_meta(syscall_nr); 588 if (!sys_data) 589 return; 590 591 /* We can probably do that at build time */ 592 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 593 size -= sizeof(u32); 594 595 /* 596 * Impossible, but be paranoid with the future 597 * How to put this check outside runtime? 598 */ 599 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 600 "exit event has grown above perf buffer size")) 601 return; 602 603 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 604 sys_data->exit_event->event.type, regs, &rctx); 605 if (!rec) 606 return; 607 608 rec->nr = syscall_nr; 609 rec->ret = syscall_get_return_value(current, regs); 610 611 head = this_cpu_ptr(sys_data->exit_event->perf_events); 612 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head, NULL); 613 } 614 615 static int perf_sysexit_enable(struct ftrace_event_call *call) 616 { 617 int ret = 0; 618 int num; 619 620 num = ((struct syscall_metadata *)call->data)->syscall_nr; 621 622 mutex_lock(&syscall_trace_lock); 623 if (!sys_perf_refcount_exit) 624 ret = register_trace_sys_exit(perf_syscall_exit, NULL); 625 if (ret) { 626 pr_info("event trace: Could not activate" 627 "syscall exit trace point"); 628 } else { 629 set_bit(num, enabled_perf_exit_syscalls); 630 sys_perf_refcount_exit++; 631 } 632 mutex_unlock(&syscall_trace_lock); 633 return ret; 634 } 635 636 static void perf_sysexit_disable(struct ftrace_event_call *call) 637 { 638 int num; 639 640 num = ((struct syscall_metadata *)call->data)->syscall_nr; 641 642 mutex_lock(&syscall_trace_lock); 643 sys_perf_refcount_exit--; 644 clear_bit(num, enabled_perf_exit_syscalls); 645 if (!sys_perf_refcount_exit) 646 unregister_trace_sys_exit(perf_syscall_exit, NULL); 647 mutex_unlock(&syscall_trace_lock); 648 } 649 650 #endif /* CONFIG_PERF_EVENTS */ 651 652 static int syscall_enter_register(struct ftrace_event_call *event, 653 enum trace_reg type, void *data) 654 { 655 switch (type) { 656 case TRACE_REG_REGISTER: 657 return reg_event_syscall_enter(event); 658 case TRACE_REG_UNREGISTER: 659 unreg_event_syscall_enter(event); 660 return 0; 661 662 #ifdef CONFIG_PERF_EVENTS 663 case TRACE_REG_PERF_REGISTER: 664 return perf_sysenter_enable(event); 665 case TRACE_REG_PERF_UNREGISTER: 666 perf_sysenter_disable(event); 667 return 0; 668 case TRACE_REG_PERF_OPEN: 669 case TRACE_REG_PERF_CLOSE: 670 case TRACE_REG_PERF_ADD: 671 case TRACE_REG_PERF_DEL: 672 return 0; 673 #endif 674 } 675 return 0; 676 } 677 678 static int syscall_exit_register(struct ftrace_event_call *event, 679 enum trace_reg type, void *data) 680 { 681 switch (type) { 682 case TRACE_REG_REGISTER: 683 return reg_event_syscall_exit(event); 684 case TRACE_REG_UNREGISTER: 685 unreg_event_syscall_exit(event); 686 return 0; 687 688 #ifdef CONFIG_PERF_EVENTS 689 case TRACE_REG_PERF_REGISTER: 690 return perf_sysexit_enable(event); 691 case TRACE_REG_PERF_UNREGISTER: 692 perf_sysexit_disable(event); 693 return 0; 694 case TRACE_REG_PERF_OPEN: 695 case TRACE_REG_PERF_CLOSE: 696 case TRACE_REG_PERF_ADD: 697 case TRACE_REG_PERF_DEL: 698 return 0; 699 #endif 700 } 701 return 0; 702 } 703