1 #include <trace/syscall.h> 2 #include <trace/events/syscalls.h> 3 #include <linux/slab.h> 4 #include <linux/kernel.h> 5 #include <linux/ftrace.h> 6 #include <linux/perf_event.h> 7 #include <asm/syscall.h> 8 9 #include "trace_output.h" 10 #include "trace.h" 11 12 static DEFINE_MUTEX(syscall_trace_lock); 13 static int sys_refcount_enter; 14 static int sys_refcount_exit; 15 static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 16 static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 17 18 static int syscall_enter_register(struct ftrace_event_call *event, 19 enum trace_reg type); 20 static int syscall_exit_register(struct ftrace_event_call *event, 21 enum trace_reg type); 22 23 static int syscall_enter_define_fields(struct ftrace_event_call *call); 24 static int syscall_exit_define_fields(struct ftrace_event_call *call); 25 26 static struct list_head * 27 syscall_get_enter_fields(struct ftrace_event_call *call) 28 { 29 struct syscall_metadata *entry = call->data; 30 31 return &entry->enter_fields; 32 } 33 34 struct trace_event_functions enter_syscall_print_funcs = { 35 .trace = print_syscall_enter, 36 }; 37 38 struct trace_event_functions exit_syscall_print_funcs = { 39 .trace = print_syscall_exit, 40 }; 41 42 struct ftrace_event_class event_class_syscall_enter = { 43 .system = "syscalls", 44 .reg = syscall_enter_register, 45 .define_fields = syscall_enter_define_fields, 46 .get_fields = syscall_get_enter_fields, 47 .raw_init = init_syscall_trace, 48 }; 49 50 struct ftrace_event_class event_class_syscall_exit = { 51 .system = "syscalls", 52 .reg = syscall_exit_register, 53 .define_fields = syscall_exit_define_fields, 54 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), 55 .raw_init = init_syscall_trace, 56 }; 57 58 extern unsigned long __start_syscalls_metadata[]; 59 extern unsigned long __stop_syscalls_metadata[]; 60 61 static struct syscall_metadata **syscalls_metadata; 62 63 static struct syscall_metadata *find_syscall_meta(unsigned long syscall) 64 { 65 struct syscall_metadata *start; 66 struct syscall_metadata *stop; 67 char str[KSYM_SYMBOL_LEN]; 68 69 70 start = (struct syscall_metadata *)__start_syscalls_metadata; 71 stop = (struct syscall_metadata *)__stop_syscalls_metadata; 72 kallsyms_lookup(syscall, NULL, NULL, NULL, str); 73 74 for ( ; start < stop; start++) { 75 /* 76 * Only compare after the "sys" prefix. Archs that use 77 * syscall wrappers may have syscalls symbols aliases prefixed 78 * with "SyS" instead of "sys", leading to an unwanted 79 * mismatch. 80 */ 81 if (start->name && !strcmp(start->name + 3, str + 3)) 82 return start; 83 } 84 return NULL; 85 } 86 87 static struct syscall_metadata *syscall_nr_to_meta(int nr) 88 { 89 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) 90 return NULL; 91 92 return syscalls_metadata[nr]; 93 } 94 95 enum print_line_t 96 print_syscall_enter(struct trace_iterator *iter, int flags, 97 struct trace_event *event) 98 { 99 struct trace_seq *s = &iter->seq; 100 struct trace_entry *ent = iter->ent; 101 struct syscall_trace_enter *trace; 102 struct syscall_metadata *entry; 103 int i, ret, syscall; 104 105 trace = (typeof(trace))ent; 106 syscall = trace->nr; 107 entry = syscall_nr_to_meta(syscall); 108 109 if (!entry) 110 goto end; 111 112 if (entry->enter_event->event.type != ent->type) { 113 WARN_ON_ONCE(1); 114 goto end; 115 } 116 117 ret = trace_seq_printf(s, "%s(", entry->name); 118 if (!ret) 119 return TRACE_TYPE_PARTIAL_LINE; 120 121 for (i = 0; i < entry->nb_args; i++) { 122 /* parameter types */ 123 if (trace_flags & TRACE_ITER_VERBOSE) { 124 ret = trace_seq_printf(s, "%s ", entry->types[i]); 125 if (!ret) 126 return TRACE_TYPE_PARTIAL_LINE; 127 } 128 /* parameter values */ 129 ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i], 130 trace->args[i], 131 i == entry->nb_args - 1 ? "" : ", "); 132 if (!ret) 133 return TRACE_TYPE_PARTIAL_LINE; 134 } 135 136 ret = trace_seq_putc(s, ')'); 137 if (!ret) 138 return TRACE_TYPE_PARTIAL_LINE; 139 140 end: 141 ret = trace_seq_putc(s, '\n'); 142 if (!ret) 143 return TRACE_TYPE_PARTIAL_LINE; 144 145 return TRACE_TYPE_HANDLED; 146 } 147 148 enum print_line_t 149 print_syscall_exit(struct trace_iterator *iter, int flags, 150 struct trace_event *event) 151 { 152 struct trace_seq *s = &iter->seq; 153 struct trace_entry *ent = iter->ent; 154 struct syscall_trace_exit *trace; 155 int syscall; 156 struct syscall_metadata *entry; 157 int ret; 158 159 trace = (typeof(trace))ent; 160 syscall = trace->nr; 161 entry = syscall_nr_to_meta(syscall); 162 163 if (!entry) { 164 trace_seq_printf(s, "\n"); 165 return TRACE_TYPE_HANDLED; 166 } 167 168 if (entry->exit_event->event.type != ent->type) { 169 WARN_ON_ONCE(1); 170 return TRACE_TYPE_UNHANDLED; 171 } 172 173 ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name, 174 trace->ret); 175 if (!ret) 176 return TRACE_TYPE_PARTIAL_LINE; 177 178 return TRACE_TYPE_HANDLED; 179 } 180 181 extern char *__bad_type_size(void); 182 183 #define SYSCALL_FIELD(type, name) \ 184 sizeof(type) != sizeof(trace.name) ? \ 185 __bad_type_size() : \ 186 #type, #name, offsetof(typeof(trace), name), \ 187 sizeof(trace.name), is_signed_type(type) 188 189 static 190 int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) 191 { 192 int i; 193 int pos = 0; 194 195 /* When len=0, we just calculate the needed length */ 196 #define LEN_OR_ZERO (len ? len - pos : 0) 197 198 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 199 for (i = 0; i < entry->nb_args; i++) { 200 pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", 201 entry->args[i], sizeof(unsigned long), 202 i == entry->nb_args - 1 ? "" : ", "); 203 } 204 pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); 205 206 for (i = 0; i < entry->nb_args; i++) { 207 pos += snprintf(buf + pos, LEN_OR_ZERO, 208 ", ((unsigned long)(REC->%s))", entry->args[i]); 209 } 210 211 #undef LEN_OR_ZERO 212 213 /* return the length of print_fmt */ 214 return pos; 215 } 216 217 static int set_syscall_print_fmt(struct ftrace_event_call *call) 218 { 219 char *print_fmt; 220 int len; 221 struct syscall_metadata *entry = call->data; 222 223 if (entry->enter_event != call) { 224 call->print_fmt = "\"0x%lx\", REC->ret"; 225 return 0; 226 } 227 228 /* First: called with 0 length to calculate the needed length */ 229 len = __set_enter_print_fmt(entry, NULL, 0); 230 231 print_fmt = kmalloc(len + 1, GFP_KERNEL); 232 if (!print_fmt) 233 return -ENOMEM; 234 235 /* Second: actually write the @print_fmt */ 236 __set_enter_print_fmt(entry, print_fmt, len + 1); 237 call->print_fmt = print_fmt; 238 239 return 0; 240 } 241 242 static void free_syscall_print_fmt(struct ftrace_event_call *call) 243 { 244 struct syscall_metadata *entry = call->data; 245 246 if (entry->enter_event == call) 247 kfree(call->print_fmt); 248 } 249 250 static int syscall_enter_define_fields(struct ftrace_event_call *call) 251 { 252 struct syscall_trace_enter trace; 253 struct syscall_metadata *meta = call->data; 254 int ret; 255 int i; 256 int offset = offsetof(typeof(trace), args); 257 258 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 259 if (ret) 260 return ret; 261 262 for (i = 0; i < meta->nb_args; i++) { 263 ret = trace_define_field(call, meta->types[i], 264 meta->args[i], offset, 265 sizeof(unsigned long), 0, 266 FILTER_OTHER); 267 offset += sizeof(unsigned long); 268 } 269 270 return ret; 271 } 272 273 static int syscall_exit_define_fields(struct ftrace_event_call *call) 274 { 275 struct syscall_trace_exit trace; 276 int ret; 277 278 ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); 279 if (ret) 280 return ret; 281 282 ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 283 FILTER_OTHER); 284 285 return ret; 286 } 287 288 void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 289 { 290 struct syscall_trace_enter *entry; 291 struct syscall_metadata *sys_data; 292 struct ring_buffer_event *event; 293 struct ring_buffer *buffer; 294 int size; 295 int syscall_nr; 296 297 syscall_nr = syscall_get_nr(current, regs); 298 if (syscall_nr < 0) 299 return; 300 if (!test_bit(syscall_nr, enabled_enter_syscalls)) 301 return; 302 303 sys_data = syscall_nr_to_meta(syscall_nr); 304 if (!sys_data) 305 return; 306 307 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 308 309 event = trace_current_buffer_lock_reserve(&buffer, 310 sys_data->enter_event->event.type, size, 0, 0); 311 if (!event) 312 return; 313 314 entry = ring_buffer_event_data(event); 315 entry->nr = syscall_nr; 316 syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args); 317 318 if (!filter_current_check_discard(buffer, sys_data->enter_event, 319 entry, event)) 320 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 321 } 322 323 void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 324 { 325 struct syscall_trace_exit *entry; 326 struct syscall_metadata *sys_data; 327 struct ring_buffer_event *event; 328 struct ring_buffer *buffer; 329 int syscall_nr; 330 331 syscall_nr = syscall_get_nr(current, regs); 332 if (syscall_nr < 0) 333 return; 334 if (!test_bit(syscall_nr, enabled_exit_syscalls)) 335 return; 336 337 sys_data = syscall_nr_to_meta(syscall_nr); 338 if (!sys_data) 339 return; 340 341 event = trace_current_buffer_lock_reserve(&buffer, 342 sys_data->exit_event->event.type, sizeof(*entry), 0, 0); 343 if (!event) 344 return; 345 346 entry = ring_buffer_event_data(event); 347 entry->nr = syscall_nr; 348 entry->ret = syscall_get_return_value(current, regs); 349 350 if (!filter_current_check_discard(buffer, sys_data->exit_event, 351 entry, event)) 352 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 353 } 354 355 int reg_event_syscall_enter(struct ftrace_event_call *call) 356 { 357 int ret = 0; 358 int num; 359 360 num = ((struct syscall_metadata *)call->data)->syscall_nr; 361 if (num < 0 || num >= NR_syscalls) 362 return -ENOSYS; 363 mutex_lock(&syscall_trace_lock); 364 if (!sys_refcount_enter) 365 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); 366 if (!ret) { 367 set_bit(num, enabled_enter_syscalls); 368 sys_refcount_enter++; 369 } 370 mutex_unlock(&syscall_trace_lock); 371 return ret; 372 } 373 374 void unreg_event_syscall_enter(struct ftrace_event_call *call) 375 { 376 int num; 377 378 num = ((struct syscall_metadata *)call->data)->syscall_nr; 379 if (num < 0 || num >= NR_syscalls) 380 return; 381 mutex_lock(&syscall_trace_lock); 382 sys_refcount_enter--; 383 clear_bit(num, enabled_enter_syscalls); 384 if (!sys_refcount_enter) 385 unregister_trace_sys_enter(ftrace_syscall_enter, NULL); 386 mutex_unlock(&syscall_trace_lock); 387 } 388 389 int reg_event_syscall_exit(struct ftrace_event_call *call) 390 { 391 int ret = 0; 392 int num; 393 394 num = ((struct syscall_metadata *)call->data)->syscall_nr; 395 if (num < 0 || num >= NR_syscalls) 396 return -ENOSYS; 397 mutex_lock(&syscall_trace_lock); 398 if (!sys_refcount_exit) 399 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); 400 if (!ret) { 401 set_bit(num, enabled_exit_syscalls); 402 sys_refcount_exit++; 403 } 404 mutex_unlock(&syscall_trace_lock); 405 return ret; 406 } 407 408 void unreg_event_syscall_exit(struct ftrace_event_call *call) 409 { 410 int num; 411 412 num = ((struct syscall_metadata *)call->data)->syscall_nr; 413 if (num < 0 || num >= NR_syscalls) 414 return; 415 mutex_lock(&syscall_trace_lock); 416 sys_refcount_exit--; 417 clear_bit(num, enabled_exit_syscalls); 418 if (!sys_refcount_exit) 419 unregister_trace_sys_exit(ftrace_syscall_exit, NULL); 420 mutex_unlock(&syscall_trace_lock); 421 } 422 423 int init_syscall_trace(struct ftrace_event_call *call) 424 { 425 int id; 426 427 if (set_syscall_print_fmt(call) < 0) 428 return -ENOMEM; 429 430 id = trace_event_raw_init(call); 431 432 if (id < 0) { 433 free_syscall_print_fmt(call); 434 return id; 435 } 436 437 return id; 438 } 439 440 unsigned long __init arch_syscall_addr(int nr) 441 { 442 return (unsigned long)sys_call_table[nr]; 443 } 444 445 int __init init_ftrace_syscalls(void) 446 { 447 struct syscall_metadata *meta; 448 unsigned long addr; 449 int i; 450 451 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) * 452 NR_syscalls, GFP_KERNEL); 453 if (!syscalls_metadata) { 454 WARN_ON(1); 455 return -ENOMEM; 456 } 457 458 for (i = 0; i < NR_syscalls; i++) { 459 addr = arch_syscall_addr(i); 460 meta = find_syscall_meta(addr); 461 if (!meta) 462 continue; 463 464 meta->syscall_nr = i; 465 syscalls_metadata[i] = meta; 466 } 467 468 return 0; 469 } 470 core_initcall(init_ftrace_syscalls); 471 472 #ifdef CONFIG_PERF_EVENTS 473 474 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); 475 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); 476 static int sys_perf_refcount_enter; 477 static int sys_perf_refcount_exit; 478 479 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) 480 { 481 struct syscall_metadata *sys_data; 482 struct syscall_trace_enter *rec; 483 struct hlist_head *head; 484 int syscall_nr; 485 int rctx; 486 int size; 487 488 syscall_nr = syscall_get_nr(current, regs); 489 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 490 return; 491 492 sys_data = syscall_nr_to_meta(syscall_nr); 493 if (!sys_data) 494 return; 495 496 /* get the size after alignment with the u32 buffer size field */ 497 size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 498 size = ALIGN(size + sizeof(u32), sizeof(u64)); 499 size -= sizeof(u32); 500 501 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 502 "perf buffer not large enough")) 503 return; 504 505 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 506 sys_data->enter_event->event.type, regs, &rctx); 507 if (!rec) 508 return; 509 510 rec->nr = syscall_nr; 511 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 512 (unsigned long *)&rec->args); 513 514 head = this_cpu_ptr(sys_data->enter_event->perf_events); 515 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 516 } 517 518 int perf_sysenter_enable(struct ftrace_event_call *call) 519 { 520 int ret = 0; 521 int num; 522 523 num = ((struct syscall_metadata *)call->data)->syscall_nr; 524 525 mutex_lock(&syscall_trace_lock); 526 if (!sys_perf_refcount_enter) 527 ret = register_trace_sys_enter(perf_syscall_enter, NULL); 528 if (ret) { 529 pr_info("event trace: Could not activate" 530 "syscall entry trace point"); 531 } else { 532 set_bit(num, enabled_perf_enter_syscalls); 533 sys_perf_refcount_enter++; 534 } 535 mutex_unlock(&syscall_trace_lock); 536 return ret; 537 } 538 539 void perf_sysenter_disable(struct ftrace_event_call *call) 540 { 541 int num; 542 543 num = ((struct syscall_metadata *)call->data)->syscall_nr; 544 545 mutex_lock(&syscall_trace_lock); 546 sys_perf_refcount_enter--; 547 clear_bit(num, enabled_perf_enter_syscalls); 548 if (!sys_perf_refcount_enter) 549 unregister_trace_sys_enter(perf_syscall_enter, NULL); 550 mutex_unlock(&syscall_trace_lock); 551 } 552 553 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 554 { 555 struct syscall_metadata *sys_data; 556 struct syscall_trace_exit *rec; 557 struct hlist_head *head; 558 int syscall_nr; 559 int rctx; 560 int size; 561 562 syscall_nr = syscall_get_nr(current, regs); 563 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 564 return; 565 566 sys_data = syscall_nr_to_meta(syscall_nr); 567 if (!sys_data) 568 return; 569 570 /* We can probably do that at build time */ 571 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 572 size -= sizeof(u32); 573 574 /* 575 * Impossible, but be paranoid with the future 576 * How to put this check outside runtime? 577 */ 578 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 579 "exit event has grown above perf buffer size")) 580 return; 581 582 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 583 sys_data->exit_event->event.type, regs, &rctx); 584 if (!rec) 585 return; 586 587 rec->nr = syscall_nr; 588 rec->ret = syscall_get_return_value(current, regs); 589 590 head = this_cpu_ptr(sys_data->exit_event->perf_events); 591 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); 592 } 593 594 int perf_sysexit_enable(struct ftrace_event_call *call) 595 { 596 int ret = 0; 597 int num; 598 599 num = ((struct syscall_metadata *)call->data)->syscall_nr; 600 601 mutex_lock(&syscall_trace_lock); 602 if (!sys_perf_refcount_exit) 603 ret = register_trace_sys_exit(perf_syscall_exit, NULL); 604 if (ret) { 605 pr_info("event trace: Could not activate" 606 "syscall exit trace point"); 607 } else { 608 set_bit(num, enabled_perf_exit_syscalls); 609 sys_perf_refcount_exit++; 610 } 611 mutex_unlock(&syscall_trace_lock); 612 return ret; 613 } 614 615 void perf_sysexit_disable(struct ftrace_event_call *call) 616 { 617 int num; 618 619 num = ((struct syscall_metadata *)call->data)->syscall_nr; 620 621 mutex_lock(&syscall_trace_lock); 622 sys_perf_refcount_exit--; 623 clear_bit(num, enabled_perf_exit_syscalls); 624 if (!sys_perf_refcount_exit) 625 unregister_trace_sys_exit(perf_syscall_exit, NULL); 626 mutex_unlock(&syscall_trace_lock); 627 } 628 629 #endif /* CONFIG_PERF_EVENTS */ 630 631 static int syscall_enter_register(struct ftrace_event_call *event, 632 enum trace_reg type) 633 { 634 switch (type) { 635 case TRACE_REG_REGISTER: 636 return reg_event_syscall_enter(event); 637 case TRACE_REG_UNREGISTER: 638 unreg_event_syscall_enter(event); 639 return 0; 640 641 #ifdef CONFIG_PERF_EVENTS 642 case TRACE_REG_PERF_REGISTER: 643 return perf_sysenter_enable(event); 644 case TRACE_REG_PERF_UNREGISTER: 645 perf_sysenter_disable(event); 646 return 0; 647 #endif 648 } 649 return 0; 650 } 651 652 static int syscall_exit_register(struct ftrace_event_call *event, 653 enum trace_reg type) 654 { 655 switch (type) { 656 case TRACE_REG_REGISTER: 657 return reg_event_syscall_exit(event); 658 case TRACE_REG_UNREGISTER: 659 unreg_event_syscall_exit(event); 660 return 0; 661 662 #ifdef CONFIG_PERF_EVENTS 663 case TRACE_REG_PERF_REGISTER: 664 return perf_sysexit_enable(event); 665 case TRACE_REG_PERF_UNREGISTER: 666 perf_sysexit_disable(event); 667 return 0; 668 #endif 669 } 670 return 0; 671 } 672