// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <linux/xarray.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

/* Serializes registration/unregistration of the syscall tracepoints below */
static DEFINE_MUTEX(syscall_trace_lock);

static int syscall_enter_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);

/* Return the field list describing a syscall's entry arguments */
static struct list_head *
syscall_get_enter_fields(struct trace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

/* Build-time table of syscall metadata, bounded by linker symbols */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/*
 * Archs with sparse syscall numbers index metadata via an xarray;
 * everyone else uses the flat syscalls_metadata[] array.
 */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Only compare after the "sys" prefix. Archs that use
	 * syscall wrappers may have syscalls symbols aliases prefixed
	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
	 * mismatch.
	 */
	return !strcmp(sym + 3, name + 3);
}
#endif

#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 *     *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */

/*
 * Map a syscall entry address to its metadata by resolving the symbol
 * name and comparing against the build-time table. Returns NULL for
 * unimplemented syscalls (sys_ni_syscall) or unknown symbols.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];


	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}

/* Look up syscall metadata by number; NULL when out of range or unmapped */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);

	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

/* Return the name of @syscall, or NULL if it has no metadata */
const char *get_syscall_name(int syscall)
{
	struct syscall_metadata *entry;

	entry = syscall_nr_to_meta(syscall);
	if (!entry)
		return NULL;

	return entry->name;
}

/* Added to user strings or arrays when max limit is reached */
#define EXTRA "..."

/*
 * Decode the next user-space-data meta word of @trace.
 *
 * Each recorded user-space argument appends a 4-byte meta word after the
 * static args, encoded as (len << 16 | offset-from-event-start).
 * @offset_p tracks how far into the meta words the caller has consumed;
 * it is advanced by 4 on return. @len_p and @ptr_p receive the decoded
 * length and a pointer to the copied data within the event.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* This arg points to a user space string */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}

/*
 * Pretty-printer for sys_enter_openat: shows the copied filename string,
 * symbolic O_* flags, and octal mode (mode only when O_CREAT/O_TMPFILE).
 * Used instead of the generic printer unless the verbose flag is set.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	static const struct trace_print_flags __flags[] =
	{
		{ O_TMPFILE, "O_TMPFILE" },
		{ O_WRONLY, "O_WRONLY" },
		{ O_RDWR, "O_RDWR" },
		{ O_CREAT, "O_CREAT" },
		{ O_EXCL, "O_EXCL" },
		{ O_NOCTTY, "O_NOCTTY" },
		{ O_TRUNC, "O_TRUNC" },
		{ O_APPEND, "O_APPEND" },
		{ O_NONBLOCK, "O_NONBLOCK" },
		{ O_DSYNC, "O_DSYNC" },
		{ O_DIRECT, "O_DIRECT" },
		{ O_LARGEFILE, "O_LARGEFILE" },
		{ O_DIRECTORY, "O_DIRECTORY" },
		{ O_NOFOLLOW, "O_NOFOLLOW" },
		{ O_NOATIME, "O_NOATIME" },
		{ O_CLOEXEC, "O_CLOEXEC" },
		{ -1, NULL }
	};

	trace_seq_printf(s, "%s(", entry->name);

	/* openat args: 0=dfd, 1=filename, 2=flags, 3=mode */
	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* O_RDONLY is 0, so it never shows up in the flag table */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags);
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			/* mode in octal */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* This arg had its user space content recorded; show it */
		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
 end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/*
 * Generic "syscall(arg: val, ...)" printer for sys_enter events.
 * Small values print in decimal, larger in hex. Recorded user space
 * data is shown as a quoted string, or for sized (array) data as a
 * hex dump plus a printable-character rendering.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	/* Some syscalls have custom printers (unless verbose output is on) */
	switch (entry->syscall_nr) {
	case __NR_openat:
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* Unsized or string data: print as a quoted string */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* val is the size the syscall was asked to use */
		val = trace->args[entry->user_arg_size];

		/* Sized data: hex dump, ':' separated */
		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		/* Also render the data as text, '.' for non-printables */
		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
 end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/* Print a sys_exit event as "name -> 0xret" */
static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_putc(s, '\n');
		goto out;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
			 trace->ret);

 out:
	return trace_handle_return(s);
}

#define SYSCALL_FIELD(_type, _name) {					\
	.type = #_type, .name = #_name,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

/*
 * Build the custom print_fmt string exported for the openat enter event,
 * mirroring what sys_enter_openat_print() produces at read time so user
 * space parsers (e.g. libtraceevent) format it the same way.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}

/*
 * Build the print_fmt string for a syscall enter event. Called with
 * len=0 first to size the buffer (see LEN_OR_ZERO), then again to fill it.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Allocate and install call->print_fmt. Exit events share one static
 * format; enter events get a per-syscall allocated string (freed by
 * free_syscall_print_fmt() on init failure).
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/* Free the allocated print_fmt (enter events only; exit fmt is static) */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

/*
 * Define the event fields for a syscall enter event: one unsigned long
 * per argument, plus a "__data_loc char[] __<arg>_val" dynamic field for
 * every argument whose user space content gets recorded (user_mask bits).
 * NOTE(review): the name strings allocated for the dynamic fields appear
 * to be owned by the field from then on and are never freed here on
 * success — presumably they live as long as the event; confirm against
 * trace_define_field() semantics.
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			/* Disable user space recording rather than fail hard */
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		/* Each dynamic field uses a 4-byte meta word in the event */
		offset += 4;
	}
	return ret;
}

/*
 * Create a per CPU temporary buffer to copy user space pointers into.
 *
 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
 * (defined in kernel/trace/trace.h)
 *
 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
 * nul terminating byte and possibly appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
 * to copy memory from user space addresses into that will hold
 * 3 args as only 3 args are allowed to be copied from system calls.
 */
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT 3
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)

/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info buf;
	struct rcu_head rcu;
};

/* Refcounted singleton; NULL when no faulting syscall event is enabled */
static struct syscall_user_buffer *syscall_buffer;

/*
 * Take a reference on the per CPU fault buffer, allocating it on first
 * use. Caller must hold syscall_trace_lock. Returns 0 or -errno.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	/* Paired with READ_ONCE() in syscall_get_data() */
	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}

/* RCU callback: tear down and free the fault buffer after readers drain */
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
{
	struct syscall_user_buffer *sbuf =
		container_of(rcu, struct syscall_user_buffer, rcu);

	trace_user_fault_destroy(&sbuf->buf);
	kfree(sbuf);
}


/*
 * Drop a reference on the fault buffer; free it via RCU-tasks-trace
 * once the last user goes away. Caller must hold syscall_trace_lock.
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *sbuf = syscall_buffer;

	lockdep_assert_held(&syscall_trace_lock);

	if (trace_user_fault_put(&sbuf->buf))
		return;

	WRITE_ONCE(syscall_buffer, NULL);
	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
}

/* Scratch state passed to the trace_user_fault_read() copy callbacks */
struct syscall_args {
	char *ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user pointers to read */
	int read[SYSCALL_FAULT_MAX_CNT];	/* bytes read (or < 0 on fault) */
	int uargs;				/* number of pointers used */
};

/*
 * Copy callback for string arguments: strncpy_from_user() each pointer
 * into its own SYSCALL_FAULT_ARG_SZ slot of @buf, recording the lengths.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = strncpy_from_user(buf, ptr, size);
		args->read[i] = ret;
	}
	return 0;
}

/*
 * Copy callback for sized (array) arguments: raw copy of @size bytes
 * per pointer; read[] is -1 on any fault, else @size.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = __copy_from_user(buf, ptr, size);
		args->read[i] = ret ? -1 : size;
	}
	return 0;
}

/*
 * Fault in the user space data for the syscall's user_mask arguments.
 *
 * Returns the per CPU buffer holding the copied data (one
 * SYSCALL_FAULT_ARG_SZ slot per argument) or NULL. On return,
 * data_size[] holds the byte count recorded per slot, -1 for unused
 * slots. Strings are sanitized (non-printables become '.') and get
 * EXTRA ("...") appended when truncated by the buf_size limit.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}

/*
 * Fault in the user data for this syscall and compute how much extra
 * event space it needs. Returns -1 if tracing is shutting down (no
 * fault buffer), else 0 with *buffer, *size, user_sizes[] and *uargs
 * filled in.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}

/*
 * Append the faulted user data to the reserved event @entry: first the
 * per-argument meta words (offset | len << 16, the encoding decoded by
 * get_dynamic_len_ptr()), then the data itself.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}

/*
 * sys_enter tracepoint probe: record the syscall number, its arguments,
 * and (for user_mask syscalls) the referenced user space data into the
 * ring buffer of @data's trace_array.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	/* Static part of the event plus appended user data (size) */
	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}

/*
 * sys_exit tracepoint probe: record the syscall number and its return
 * value into the ring buffer of @data's trace_array.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}

/*
 * Enable the sys_enter event for one syscall in @file's trace_array.
 * Registers the shared tracepoint probe on first use and takes a
 * fault-buffer reference for user_mask syscalls.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Undo the fault-buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}

/* Disable the sys_enter event for one syscall; mirrors the reg path */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}

/* Enable the sys_exit event for one syscall in @file's trace_array */
static int reg_event_syscall_exit(struct trace_event_file *file,
				  struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!tr->sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
	if (!ret) {
		WRITE_ONCE(tr->exit_syscall_files[num], file);
		tr->sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

/* Disable the sys_exit event for one syscall; mirrors the reg path */
static void unreg_event_syscall_exit(struct trace_event_file *file,
				     struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	tr->sys_refcount_exit--;
	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
	if (!tr->sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
	mutex_unlock(&syscall_trace_lock);
}

/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
	struct syscall_metadata *sys_data = call->data;
	unsigned long mask;

	/* Only work on entry */
	if (sys_data->enter_event != call)
		return;

	/* -1 means "no size argument"; data is read as a string */
	sys_data->user_arg_size = -1;

	switch (nr) {
	/* user arg 1 with size arg at 2 */
	case __NR_write:
#ifdef __NR_mq_timedsend
	case __NR_mq_timedsend:
#endif
	case __NR_pwrite64:
		sys_data->user_mask = BIT(1);
		sys_data->user_arg_size = 2;
		break;
	/* user arg 0 with size arg at 1 as string */
	case __NR_setdomainname:
	case __NR_sethostname:
		sys_data->user_mask = BIT(0);
		sys_data->user_arg_size = 1;
		sys_data->user_arg_is_str = 1;
		break;
#ifdef __NR_kexec_file_load
	/* user arg 4 with size arg at 3 as string */
	case __NR_kexec_file_load:
		sys_data->user_mask = BIT(4);
		sys_data->user_arg_size = 3;
		sys_data->user_arg_is_str = 1;
		break;
#endif
	/* user arg at position 0 */
#ifdef __NR_access
	case __NR_access:
#endif
	case __NR_acct:
	case __NR_chdir:
#ifdef __NR_chown
	case __NR_chown:
#endif
#ifdef __NR_chmod
	case __NR_chmod:
#endif
	case __NR_chroot:
#ifdef __NR_creat
	case __NR_creat:
#endif
	case __NR_delete_module:
	case __NR_execve:
	case __NR_fsopen:
#ifdef __NR_lchown
	case __NR_lchown:
#endif
#ifdef __NR_open
	case __NR_open:
#endif
	case __NR_memfd_create:
#ifdef __NR_mkdir
	case __NR_mkdir:
#endif
#ifdef __NR_mknod
	case __NR_mknod:
#endif
	case __NR_mq_open:
	case __NR_mq_unlink:
#ifdef __NR_readlink
	case __NR_readlink:
#endif
#ifdef __NR_rmdir
	case __NR_rmdir:
#endif
	case __NR_shmdt:
#ifdef __NR_statfs
	case __NR_statfs:
#endif
	case __NR_swapon:
	case __NR_swapoff:
#ifdef __NR_truncate
	case __NR_truncate:
#endif
#ifdef __NR_unlink
	case __NR_unlink:
#endif
	case __NR_umount2:
#ifdef __NR_utime
	case __NR_utime:
#endif
#ifdef __NR_utimes
	case __NR_utimes:
#endif
		sys_data->user_mask = BIT(0);
		break;
	/* user arg at position 1 */
	case __NR_execveat:
	case __NR_faccessat:
	case __NR_faccessat2:
	case __NR_finit_module:
	case __NR_fchmodat:
	case __NR_fchmodat2:
	case __NR_fchownat:
	case __NR_fgetxattr:
	case __NR_flistxattr:
	case __NR_fsetxattr:
	case __NR_fspick:
	case __NR_fremovexattr:
#ifdef __NR_futimesat
	case __NR_futimesat:
#endif
	case __NR_inotify_add_watch:
	case __NR_mkdirat:
	case __NR_mknodat:
	case __NR_mount_setattr:
	case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
	case __NR_newfstatat:
#endif
	case __NR_openat:
	case __NR_openat2:
	case __NR_open_tree:
	case __NR_open_tree_attr:
	case __NR_readlinkat:
	case __NR_quotactl:
	case __NR_syslog:
	case __NR_statx:
	case __NR_unlinkat:
#ifdef __NR_utimensat
	case __NR_utimensat:
#endif
		sys_data->user_mask = BIT(1);
		break;
	/* user arg at position 2 */
	case __NR_init_module:
	case __NR_fsconfig:
		sys_data->user_mask = BIT(2);
		break;
	/* user arg at position 4 */
	case __NR_fanotify_mark:
		sys_data->user_mask = BIT(4);
		break;
	/* 2 user args, 0 and 1 */
	case __NR_add_key:
	case __NR_getxattr:
	case __NR_lgetxattr:
	case __NR_lremovexattr:
#ifdef __NR_link
	case __NR_link:
#endif
	case __NR_listxattr:
	case __NR_llistxattr:
	case __NR_lsetxattr:
	case __NR_pivot_root:
	case __NR_removexattr:
#ifdef __NR_rename
	case __NR_rename:
#endif
	case __NR_request_key:
	case __NR_setxattr:
#ifdef __NR_symlink
	case __NR_symlink:
#endif
		sys_data->user_mask = BIT(0) | BIT(1);
		break;
	/* 2 user args, 0 and 2 */
	case __NR_symlinkat:
		sys_data->user_mask = BIT(0) | BIT(2);
		break;
	/* 2 user args, 1 and 3 */
	case __NR_getxattrat:
	case __NR_linkat:
	case __NR_listxattrat:
	case __NR_move_mount:
#ifdef __NR_renameat
	case __NR_renameat:
#endif
	case __NR_renameat2:
	case __NR_removexattrat:
	case __NR_setxattrat:
		sys_data->user_mask = BIT(1) | BIT(3);
		break;
	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
		break;
	default:
		sys_data->user_mask = 0;
		return;
	}

	if (sys_data->user_arg_size < 0)
		return;

	/*
	 * The user_arg_size can only be used when the system call
	 * is reading only a single address from user space.
	 */
	mask = sys_data->user_mask;
	if (WARN_ON(mask & (mask - 1)))
		sys_data->user_arg_size = -1;
}

/*
 * One-time init of a syscall trace event: classify its user space
 * arguments, build its print_fmt, and register it with the event
 * subsystem. Returns the event type id or -errno.
 */
static int __init init_syscall_trace(struct trace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
			 ((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	check_faultable_syscall(call, num);

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

/* Field layout: the syscall nr, then per-syscall dynamic fields */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};

struct trace_event_functions enter_syscall_print_funcs = {
	.trace
= print_syscall_enter, 1301 }; 1302 1303 struct trace_event_functions exit_syscall_print_funcs = { 1304 .trace = print_syscall_exit, 1305 }; 1306 1307 struct trace_event_class __refdata event_class_syscall_enter = { 1308 .system = "syscalls", 1309 .reg = syscall_enter_register, 1310 .fields_array = syscall_enter_fields_array, 1311 .get_fields = syscall_get_enter_fields, 1312 .raw_init = init_syscall_trace, 1313 }; 1314 1315 struct trace_event_class __refdata event_class_syscall_exit = { 1316 .system = "syscalls", 1317 .reg = syscall_exit_register, 1318 .fields_array = (struct trace_event_fields[]){ 1319 SYSCALL_FIELD(int, __syscall_nr), 1320 SYSCALL_FIELD(long, ret), 1321 {} 1322 }, 1323 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), 1324 .raw_init = init_syscall_trace, 1325 }; 1326 1327 unsigned long __init __weak arch_syscall_addr(int nr) 1328 { 1329 return (unsigned long)sys_call_table[nr]; 1330 } 1331 1332 void __init init_ftrace_syscalls(void) 1333 { 1334 struct syscall_metadata *meta; 1335 unsigned long addr; 1336 int i; 1337 void *ret; 1338 1339 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { 1340 syscalls_metadata = kcalloc(NR_syscalls, 1341 sizeof(*syscalls_metadata), 1342 GFP_KERNEL); 1343 if (!syscalls_metadata) { 1344 WARN_ON(1); 1345 return; 1346 } 1347 } 1348 1349 for (i = 0; i < NR_syscalls; i++) { 1350 addr = arch_syscall_addr(i); 1351 meta = find_syscall_meta(addr); 1352 if (!meta) 1353 continue; 1354 1355 meta->syscall_nr = i; 1356 1357 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { 1358 syscalls_metadata[i] = meta; 1359 } else { 1360 ret = xa_store(&syscalls_metadata_sparse, i, meta, 1361 GFP_KERNEL); 1362 WARN(xa_is_err(ret), 1363 "Syscall memory allocation failed\n"); 1364 } 1365 1366 } 1367 } 1368 1369 #ifdef CONFIG_PERF_EVENTS 1370 1371 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); 1372 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); 1373 static int sys_perf_refcount_enter; 1374 static int 
sys_perf_refcount_exit; 1375 1376 static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, 1377 struct syscall_metadata *sys_data, 1378 struct syscall_trace_enter *rec) 1379 { 1380 struct syscall_tp_t { 1381 struct trace_entry ent; 1382 int syscall_nr; 1383 unsigned long args[SYSCALL_DEFINE_MAXARGS]; 1384 } __aligned(8) param; 1385 int i; 1386 1387 BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); 1388 1389 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ 1390 perf_fetch_caller_regs(regs); 1391 *(struct pt_regs **)¶m = regs; 1392 param.syscall_nr = rec->nr; 1393 for (i = 0; i < sys_data->nb_args; i++) 1394 param.args[i] = rec->args[i]; 1395 return trace_call_bpf(call, ¶m); 1396 } 1397 1398 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) 1399 { 1400 struct syscall_metadata *sys_data; 1401 struct syscall_trace_enter *rec; 1402 struct pt_regs *fake_regs; 1403 struct hlist_head *head; 1404 unsigned long args[6]; 1405 bool valid_prog_array; 1406 bool mayfault; 1407 char *user_ptr; 1408 int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; 1409 int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; 1410 int syscall_nr; 1411 int rctx; 1412 int size = 0; 1413 int uargs = 0; 1414 1415 /* 1416 * Syscall probe called with preemption enabled, but the ring 1417 * buffer and per-cpu data require preemption to be disabled. 
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	if (mayfault) {
		/*
		 * Copy the user space data before disabling further faults;
		 * on failure drop the event rather than record bogus data.
		 */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the copied user space data after the fixed args */
	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/*
	 * If a BPF program rejected the event, or nothing is attached to
	 * the perf hlist, release the recursion context without submitting.
	 */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}

/*
 * Enable the perf syscall-enter probe for @call's syscall.  The first
 * enabled event registers the shared sys_enter tracepoint probe; events
 * that copy user space data also take a fault buffer reference first.
 */
static int perf_sysenter_enable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;
	int ret;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!sys_perf_refcount_enter) {
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (ret) {
			pr_info("event trace: Could not activate syscall entry trace point");
			/* Roll back the fault buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	set_bit(num, enabled_perf_enter_syscalls);
	sys_perf_refcount_enter++;
	return 0;
}

/*
 * Disable the perf syscall-enter probe for @call's syscall; the last
 * disabled event unregisters the shared tracepoint probe.
 */
static void perf_sysenter_disable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}

/*
 * Hand a syscall-exit record to an attached BPF program, repackaged into
 * the fixed layout (nr + ret) that BPF syscall programs expect.
 */
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
			      struct syscall_trace_exit *rec)
{
	struct syscall_tp_t {
		struct trace_entry ent;
		int syscall_nr;
		unsigned long ret;
	} __aligned(8) param;

	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
	perf_fetch_caller_regs(regs);
	*(struct pt_regs **)&param = regs;
	param.syscall_nr = rec->nr;
	param.ret = rec->ret;
	return trace_call_bpf(call, &param);
}

/*
 * Tracepoint probe for sys_exit: emit a perf sample (and/or run BPF) with
 * the syscall number and return value.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	bool valid_prog_array;
	int syscall_nr;
	int rctx;
	int size;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	/* Skip submission if BPF rejected the event or nothing is attached */
	if ((valid_prog_array &&
	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
			      1, regs, head, NULL);
}

/*
 * Enable the perf syscall-exit probe for @call's syscall; the first
 * enabled event registers the shared sys_exit tracepoint probe.
 */
static int perf_sysexit_enable(struct trace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	if (!sys_perf_refcount_exit) {
		int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
		if (ret) {
			pr_info("event trace: Could not activate syscall exit trace point");
			return ret;
		}
	}
	set_bit(num, enabled_perf_exit_syscalls);
	sys_perf_refcount_exit++;
	return 0;
}

/*
 * Disable the perf syscall-exit probe for @call's syscall; the last
 * disabled event unregisters the shared tracepoint probe.
 */
static void perf_sysexit_disable(struct trace_event_call *call)
{
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	sys_perf_refcount_exit--;
	clear_bit(num, enabled_perf_exit_syscalls);
	if (!sys_perf_refcount_exit)
		unregister_trace_sys_exit(perf_syscall_exit, NULL);
}

#endif /* CONFIG_PERF_EVENTS */

/*
 * trace_event_class .reg callback for syscall-enter events: dispatch
 * ftrace and perf (un)registration requests to the matching helpers.
 */
static int syscall_enter_register(struct trace_event_call *event,
				  enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}

/*
 * trace_event_class .reg callback for syscall-exit events: dispatch
 * ftrace and perf (un)registration requests to the matching helpers.
 */
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}