// SPDX-License-Identifier: GPL-2.0
#include <trace/syscall.h>
#include <trace/events/syscalls.h>
#include <linux/kernel_stat.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
#include <linux/ftrace.h>
#include <linux/perf_event.h>
#include <linux/xarray.h>
#include <asm/syscall.h>

#include "trace_output.h"
#include "trace.h"

/* Serializes registration/unregistration and the fault-buffer refcount */
static DEFINE_MUTEX(syscall_trace_lock);

static int syscall_enter_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);

/* Return the list of fields for a syscall-enter event's metadata */
static struct list_head *
syscall_get_enter_fields(struct trace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	return &entry->enter_fields;
}

/* Linker-provided bounds of the syscall metadata section */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/* Sparse mapping used when CONFIG_HAVE_SPARSE_SYSCALL_NR is set */
static DEFINE_XARRAY(syscalls_metadata_sparse);
/* Dense nr -> metadata table used otherwise (populated elsewhere in this file) */
static struct syscall_metadata **syscalls_metadata;

#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Only compare after the "sys" prefix. Archs that use
	 * syscall wrappers may have syscalls symbols aliases prefixed
	 * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
	 * mismatch.
	 */
	return !strcmp(sym + 3, name + 3);
}
#endif

#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 *     *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */

/*
 * Look up the metadata for the syscall whose handler lives at address
 * @syscall by resolving its kallsyms name and matching it against the
 * compiled-in metadata section. Returns NULL for sys_ni_syscall or if
 * no metadata entry matches.
 */
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
	struct syscall_metadata **start;
	struct syscall_metadata **stop;
	char str[KSYM_SYMBOL_LEN];


	start = __start_syscalls_metadata;
	stop = __stop_syscalls_metadata;
	kallsyms_lookup(syscall, NULL, NULL, NULL, str);

	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
		return NULL;

	for ( ; start < stop; start++) {
		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
			return *start;
	}
	return NULL;
}

/*
 * Map a syscall number to its metadata, via the xarray for sparse
 * syscall-number architectures or the dense table otherwise.
 * Returns NULL for out-of-range numbers or before the table is set up.
 */
static struct syscall_metadata *syscall_nr_to_meta(int nr)
{
	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);

	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
		return NULL;

	return syscalls_metadata[nr];
}

/* Return the name of a syscall by number, or NULL if it has no metadata */
const char *get_syscall_name(int syscall)
{
	struct syscall_metadata *entry;

	entry = syscall_nr_to_meta(syscall);
	if (!entry)
		return NULL;

	return entry->name;
}

/* Added to user strings or arrays when max limit is reached */
#define EXTRA "..."

/*
 * Decode the meta word for the next user-space capture attached to the
 * event. Each captured user arg has a 4-byte meta field after the static
 * args encoding (len << 16 | offset-from-event-start). On return, *ptr_p
 * points at the captured data, *len_p holds its length, and *offset_p is
 * advanced past this meta field for the next call.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* This arg points to a user space string */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}

/*
 * Special-cased pretty printer for sys_enter_openat: symbolically decodes
 * the flags argument (arg 2), prints mode (arg 3) in octal only when the
 * call creates a file, and appends the captured filename string.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	static const struct trace_print_flags __flags[] =
	{
		{ O_TMPFILE, "O_TMPFILE" },
		{ O_WRONLY, "O_WRONLY" },
		{ O_RDWR, "O_RDWR" },
		{ O_CREAT, "O_CREAT" },
		{ O_EXCL, "O_EXCL" },
		{ O_NOCTTY, "O_NOCTTY" },
		{ O_TRUNC, "O_TRUNC" },
		{ O_APPEND, "O_APPEND" },
		{ O_NONBLOCK, "O_NONBLOCK" },
		{ O_DSYNC, "O_DSYNC" },
		{ O_DIRECT, "O_DIRECT" },
		{ O_LARGEFILE, "O_LARGEFILE" },
		{ O_DIRECTORY, "O_DIRECTORY" },
		{ O_NOFOLLOW, "O_NOFOLLOW" },
		{ O_NOATIME, "O_NOATIME" },
		{ O_CLOEXEC, "O_CLOEXEC" },
	};

	trace_seq_printf(s, "%s(", entry->name);

	/* "done" cuts the loop short once it's known mode isn't relevant */
	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			/* mode is printed in octal */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		/* Append the captured user string for args in user_mask */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/*
 * Generic pretty printer for syscall-enter events. Prints each argument
 * (decimal when < 10, hex otherwise), and for args flagged in user_mask
 * appends the captured user-space data: as a quoted string for string
 * captures, or as a hex-byte dump (plus a printable rendering when any
 * byte is printable) for sized array captures.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	/* openat gets symbolic decoding unless verbose output is requested */
	switch (entry->syscall_nr) {
	case __NR_openat:
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* Strings (or size-less captures) are printed quoted */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* Requested size from the syscall's size argument */
		val = trace->args[entry->user_arg_size];

		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		/* Captured less than the syscall asked for */
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}

/* Print a syscall-exit event as "<name> -> 0x<ret>" */
static enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry) {
		trace_seq_putc(s, '\n');
		goto out;
	}

	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
			 trace->ret);
out:
	return trace_handle_return(s);
}

#define SYSCALL_FIELD(_type, _name) {					\
	.type = #_type, .name = #_name,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

/*
 * Build the print_fmt string for sys_enter_openat, with symbolic
 * __print_flags() decoding of the flags argument so user-space parsers
 * see the same decoded output as the in-kernel printer above.
 * Returns the number of characters the full format needs.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}

/*
 * Build the print_fmt string for a syscall-enter event. Called twice:
 * once with len == 0 to size the buffer, then again to fill it.
 * Args with a user-space capture get an extra "__<arg>_val" field
 * rendered as a string or as a dynamic byte array.
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}

/*
 * Allocate and install call->print_fmt. Exit events share one static
 * format; enter events get a per-syscall kmalloc'd format (freed by
 * free_syscall_print_fmt() on failure paths).
 */
static int __init set_syscall_print_fmt(struct trace_event_call *call)
{
	char *print_fmt;
	int len;
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event != call) {
		call->print_fmt = "\"0x%lx\", REC->ret";
		return 0;
	}

	/* First: called with 0 length to calculate the needed length */
	len = __set_enter_print_fmt(entry, NULL, 0);

	print_fmt = kmalloc(len + 1, GFP_KERNEL);
	if (!print_fmt)
		return -ENOMEM;

	/* Second: actually write the @print_fmt */
	__set_enter_print_fmt(entry, print_fmt, len + 1);
	call->print_fmt = print_fmt;

	return 0;
}

/* Free a print_fmt installed by set_syscall_print_fmt() (enter events only) */
static void __init free_syscall_print_fmt(struct trace_event_call *call)
{
	struct syscall_metadata *entry = call->data;

	if (entry->enter_event == call)
		kfree(call->print_fmt);
}

/*
 * Define the ftrace fields of a syscall-enter event: one unsigned long
 * per static argument, followed by a __data_loc meta field per user-space
 * capture named "__<arg>_val". On allocation failure user_mask is cleared
 * so the event degrades to static args only.
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		offset += 4;
	}
	return ret;
}

/*
 * Create a per CPU temporary buffer to copy user space pointers into.
 *
 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
 * (defined in kernel/trace/trace.h)
 *
 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
 * nul terminating byte and possibly appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
 * to copy memory from user space addresses into that will hold
 * 3 args as only 3 args are allowed to be copied from system calls.
 */
#define SYSCALL_FAULT_ARG_SZ	(SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT	3
#define SYSCALL_FAULT_BUF_SZ	(SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)

/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info buf;
	struct rcu_head rcu;
};

/* Shared, refcounted via trace_user_fault_get/put; NULL when tracing is off */
static struct syscall_user_buffer *syscall_buffer;

/*
 * Take a reference on the shared fault buffer, allocating it on first
 * use. Caller must hold syscall_trace_lock.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc_obj(*sbuf);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}

/* RCU callback: tear down and free the fault buffer after readers finish */
static void rcu_free_syscall_buffer(struct rcu_head *rcu)
{
	struct syscall_user_buffer *sbuf =
		container_of(rcu, struct syscall_user_buffer, rcu);

	trace_user_fault_destroy(&sbuf->buf);
	kfree(sbuf);
}


/*
 * Drop a reference on the fault buffer; on the last put, clear the
 * global pointer and free it after an RCU-tasks-trace grace period so
 * in-flight syscall probes can finish with it. Caller must hold
 * syscall_trace_lock.
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *sbuf = syscall_buffer;

	lockdep_assert_held(&syscall_trace_lock);

	if (trace_user_fault_put(&sbuf->buf))
		return;

	WRITE_ONCE(syscall_buffer, NULL);
	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
}

/* Per-invocation bookkeeping for the user pointers being copied */
struct syscall_args {
	char *ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user addresses to read */
	int read[SYSCALL_FAULT_MAX_CNT];	/* bytes read (or error) per arg */
	int uargs;				/* number of valid entries */
};

/*
 * trace_user_fault_read() callback: copy each user pointer as a NUL
 * terminated string into consecutive SYSCALL_FAULT_ARG_SZ slots of the
 * buffer, recording per-arg byte counts (negative on fault).
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = strncpy_from_user(buf, ptr, size);
		args->read[i] = ret;
	}
	return 0;
}

/*
 * As above, but a raw fixed-size copy for sized (array) captures;
 * read[i] is the full size on success or -1 on fault.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = __copy_from_user(buf, ptr, size);
		args->read[i] = ret ? -1 : size;
	}
	return 0;
}

/*
 * Fault in the user-space memory referenced by this syscall's flagged
 * args. Returns the filled per-CPU buffer (one SYSCALL_FAULT_ARG_SZ slot
 * per arg) or NULL, and fills data_size[] with the per-arg byte counts
 * (-1 marks unused slots; faulted slots are left to the caller's
 * zero-initialized values). String data is sanitized to printable
 * characters and truncation is marked with EXTRA.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}

/*
 * Fault in the user data for this syscall and compute the extra event
 * size it needs (data plus one 4-byte meta field per captured arg).
 * Returns -1 if tracing is shutting down (buffer gone).
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}

/*
 * Copy the faulted user data into the reserved ring-buffer event:
 * write the meta fields (offset | len << 16) after the static args,
 * then append the data itself.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		/* Each entry starts where the previous one's data ended */
		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}

/*
 * sys_enter tracepoint probe: record a syscall-enter event for @tr.
 * For syscalls with user-space captures (user_mask set), the user data
 * is faulted in *before* preemption is disabled, then appended to the
 * event after the static args.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	/* Static part: header plus one long per argument */
	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}

/* sys_exit tracepoint probe: record the syscall's return value for @tr */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}

/*
 * Enable the syscall-enter event for @file's trace array: take a fault
 * buffer reference if this syscall captures user memory, register the
 * tracepoint probe on first use, and publish the file pointer the probe
 * reads.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Undo the fault-buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}

/* Reverse of reg_event_syscall_enter() */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}

/* Enable the syscall-exit event for @file's trace array */
static int reg_event_syscall_exit(struct trace_event_file *file,
				  struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	mutex_lock(&syscall_trace_lock);
	if (!tr->sys_refcount_exit)
		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
	if (!ret) {
		WRITE_ONCE(tr->exit_syscall_files[num], file);
		tr->sys_refcount_exit++;
	}
	mutex_unlock(&syscall_trace_lock);
	return ret;
}

/* Reverse of reg_event_syscall_exit() */
static void unreg_event_syscall_exit(struct trace_event_file *file,
				     struct trace_event_call *call)
{
	struct trace_array *tr = file->tr;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	mutex_lock(&syscall_trace_lock);
	tr->sys_refcount_exit--;
	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
	if (!tr->sys_refcount_exit)
		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
	mutex_unlock(&syscall_trace_lock);
}

/*
 * For system calls that reference user space memory that can
 * be recorded into the event, set the system call meta data's user_mask
 * to the "args" index that points to the user space memory to retrieve.
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
	struct syscall_metadata *sys_data = call->data;
	unsigned long mask;

	/* Only work on entry */
	if (sys_data->enter_event != call)
		return;

	/* -1 means no size argument; capture is treated as a string */
	sys_data->user_arg_size = -1;

	switch (nr) {
	/* user arg 1 with size arg at 2 */
	case __NR_write:
#ifdef __NR_mq_timedsend
	case __NR_mq_timedsend:
#endif
	case __NR_pwrite64:
		sys_data->user_mask = BIT(1);
		sys_data->user_arg_size = 2;
		break;
	/* user arg 0 with size arg at 1 as string */
	case __NR_setdomainname:
	case __NR_sethostname:
		sys_data->user_mask = BIT(0);
		sys_data->user_arg_size = 1;
		sys_data->user_arg_is_str = 1;
		break;
#ifdef __NR_kexec_file_load
	/* user arg 4 with size arg at 3 as string */
	case __NR_kexec_file_load:
		sys_data->user_mask = BIT(4);
		sys_data->user_arg_size = 3;
		sys_data->user_arg_is_str = 1;
		break;
#endif
	/* user arg at position 0 */
#ifdef __NR_access
	case __NR_access:
#endif
	case __NR_acct:
	case __NR_chdir:
#ifdef __NR_chown
	case __NR_chown:
#endif
#ifdef __NR_chmod
	case __NR_chmod:
#endif
	case __NR_chroot:
#ifdef __NR_creat
	case __NR_creat:
#endif
	case __NR_delete_module:
	case __NR_execve:
	case __NR_fsopen:
#ifdef __NR_lchown
	case __NR_lchown:
#endif
#ifdef __NR_open
	case __NR_open:
#endif
	case __NR_memfd_create:
#ifdef __NR_mkdir
	case __NR_mkdir:
#endif
#ifdef __NR_mknod
	case __NR_mknod:
#endif
	case __NR_mq_open:
	case __NR_mq_unlink:
#ifdef __NR_readlink
	case __NR_readlink:
#endif
#ifdef __NR_rmdir
	case __NR_rmdir:
#endif
	case __NR_shmdt:
#ifdef __NR_statfs
	case __NR_statfs:
#endif
	case __NR_swapon:
	case __NR_swapoff:
#ifdef __NR_truncate
	case __NR_truncate:
#endif
#ifdef __NR_unlink
	case __NR_unlink:
#endif
	case __NR_umount2:
#ifdef __NR_utime
	case __NR_utime:
#endif
#ifdef __NR_utimes
	case __NR_utimes:
#endif
		sys_data->user_mask = BIT(0);
		break;
	/* user arg at position 1 */
	case __NR_execveat:
	case __NR_faccessat:
	case __NR_faccessat2:
	case __NR_finit_module:
	case __NR_fchmodat:
	case __NR_fchmodat2:
	case __NR_fchownat:
	case __NR_fgetxattr:
	case __NR_flistxattr:
	case __NR_fsetxattr:
	case __NR_fspick:
	case __NR_fremovexattr:
#ifdef __NR_futimesat
	case __NR_futimesat:
#endif
	case __NR_inotify_add_watch:
	case __NR_mkdirat:
	case __NR_mknodat:
	case __NR_mount_setattr:
	case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
	case __NR_newfstatat:
#endif
	case __NR_openat:
	case __NR_openat2:
	case __NR_open_tree:
	case __NR_open_tree_attr:
	case __NR_readlinkat:
	case __NR_quotactl:
	case __NR_syslog:
	case __NR_statx:
	case __NR_unlinkat:
#ifdef __NR_utimensat
	case __NR_utimensat:
#endif
		sys_data->user_mask = BIT(1);
		break;
	/* user arg at position 2 */
	case __NR_init_module:
	case __NR_fsconfig:
		sys_data->user_mask = BIT(2);
		break;
	/* user arg at position 4 */
	case __NR_fanotify_mark:
		sys_data->user_mask = BIT(4);
		break;
	/* 2 user args, 0 and 1 */
	case __NR_add_key:
	case __NR_getxattr:
	case __NR_lgetxattr:
	case __NR_lremovexattr:
#ifdef __NR_link
	case __NR_link:
#endif
	case __NR_listxattr:
	case __NR_llistxattr:
	case __NR_lsetxattr:
	case __NR_pivot_root:
	case __NR_removexattr:
#ifdef __NR_rename
	case __NR_rename:
#endif
	case __NR_request_key:
	case __NR_setxattr:
#ifdef __NR_symlink
	case __NR_symlink:
#endif
		sys_data->user_mask = BIT(0) | BIT(1);
		break;
	/* 2 user args, 0 and 2 */
	case __NR_symlinkat:
		sys_data->user_mask = BIT(0) | BIT(2);
		break;
	/* 2 user args, 1 and 3 */
	case __NR_getxattrat:
	case __NR_linkat:
	case __NR_listxattrat:
	case __NR_move_mount:
#ifdef __NR_renameat
	case __NR_renameat:
#endif
	case __NR_renameat2:
	case __NR_removexattrat:
	case __NR_setxattrat:
		sys_data->user_mask = BIT(1) | BIT(3);
		break;
	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
		break;
	default:
		sys_data->user_mask = 0;
		return;
	}

	if (sys_data->user_arg_size < 0)
		return;

	/*
	 * The user_arg_size can only be used when the system call
	 * is reading only a single address from user space.
	 */
	mask = sys_data->user_mask;
	if (WARN_ON(mask & (mask - 1)))
		sys_data->user_arg_size = -1;
}

/*
 * One-time init for a syscall trace event: tag faultable args, build
 * its print_fmt and register it with the trace event core. Returns the
 * event type id or a negative errno.
 */
static int __init init_syscall_trace(struct trace_event_call *call)
{
	int id;
	int num;

	num = ((struct syscall_metadata *)call->data)->syscall_nr;
	if (num < 0 || num >= NR_syscalls) {
		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
				((struct syscall_metadata *)call->data)->name);
		return -ENOSYS;
	}

	check_faultable_syscall(call, num);

	if (set_syscall_print_fmt(call) < 0)
		return -ENOMEM;

	id = trace_event_raw_init(call);

	if (id < 0) {
		free_syscall_print_fmt(call);
		return id;
	}

	return id;
}

/* Enter events: the syscall nr field plus dynamically defined arg fields */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};

struct trace_event_functions enter_syscall_print_funcs = {
	.trace
= print_syscall_enter, 1300 }; 1301 1302 struct trace_event_functions exit_syscall_print_funcs = { 1303 .trace = print_syscall_exit, 1304 }; 1305 1306 struct trace_event_class __refdata event_class_syscall_enter = { 1307 .system = "syscalls", 1308 .reg = syscall_enter_register, 1309 .fields_array = syscall_enter_fields_array, 1310 .get_fields = syscall_get_enter_fields, 1311 .raw_init = init_syscall_trace, 1312 }; 1313 1314 struct trace_event_class __refdata event_class_syscall_exit = { 1315 .system = "syscalls", 1316 .reg = syscall_exit_register, 1317 .fields_array = (struct trace_event_fields[]){ 1318 SYSCALL_FIELD(int, __syscall_nr), 1319 SYSCALL_FIELD(long, ret), 1320 {} 1321 }, 1322 .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), 1323 .raw_init = init_syscall_trace, 1324 }; 1325 1326 unsigned long __init __weak arch_syscall_addr(int nr) 1327 { 1328 return (unsigned long)sys_call_table[nr]; 1329 } 1330 1331 void __init init_ftrace_syscalls(void) 1332 { 1333 struct syscall_metadata *meta; 1334 unsigned long addr; 1335 int i; 1336 void *ret; 1337 1338 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { 1339 syscalls_metadata = kzalloc_objs(*syscalls_metadata, 1340 NR_syscalls); 1341 if (!syscalls_metadata) { 1342 WARN_ON(1); 1343 return; 1344 } 1345 } 1346 1347 for (i = 0; i < NR_syscalls; i++) { 1348 addr = arch_syscall_addr(i); 1349 meta = find_syscall_meta(addr); 1350 if (!meta) 1351 continue; 1352 1353 meta->syscall_nr = i; 1354 1355 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) { 1356 syscalls_metadata[i] = meta; 1357 } else { 1358 ret = xa_store(&syscalls_metadata_sparse, i, meta, 1359 GFP_KERNEL); 1360 WARN(xa_is_err(ret), 1361 "Syscall memory allocation failed\n"); 1362 } 1363 1364 } 1365 } 1366 1367 #ifdef CONFIG_PERF_EVENTS 1368 1369 static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); 1370 static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); 1371 static int sys_perf_refcount_enter; 1372 static int 
sys_perf_refcount_exit; 1373 1374 static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs, 1375 struct syscall_metadata *sys_data, 1376 struct syscall_trace_enter *rec) 1377 { 1378 struct syscall_tp_t { 1379 struct trace_entry ent; 1380 int syscall_nr; 1381 unsigned long args[SYSCALL_DEFINE_MAXARGS]; 1382 } __aligned(8) param; 1383 int i; 1384 1385 BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *)); 1386 1387 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ 1388 perf_fetch_caller_regs(regs); 1389 *(struct pt_regs **)¶m = regs; 1390 param.syscall_nr = rec->nr; 1391 for (i = 0; i < sys_data->nb_args; i++) 1392 param.args[i] = rec->args[i]; 1393 return trace_call_bpf(call, ¶m); 1394 } 1395 1396 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) 1397 { 1398 struct syscall_metadata *sys_data; 1399 struct syscall_trace_enter *rec; 1400 struct pt_regs *fake_regs; 1401 struct hlist_head *head; 1402 unsigned long args[6]; 1403 bool valid_prog_array; 1404 bool mayfault; 1405 char *user_ptr; 1406 int user_sizes[SYSCALL_FAULT_MAX_CNT] = {}; 1407 int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT; 1408 int syscall_nr; 1409 int rctx; 1410 int size = 0; 1411 int uargs = 0; 1412 1413 /* 1414 * Syscall probe called with preemption enabled, but the ring 1415 * buffer and per-cpu data require preemption to be disabled. 
1416 */ 1417 might_fault(); 1418 guard(preempt_notrace)(); 1419 1420 syscall_nr = trace_get_syscall_nr(current, regs); 1421 if (syscall_nr < 0 || syscall_nr >= NR_syscalls) 1422 return; 1423 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 1424 return; 1425 1426 sys_data = syscall_nr_to_meta(syscall_nr); 1427 if (!sys_data) 1428 return; 1429 1430 syscall_get_arguments(current, regs, args); 1431 1432 /* Check if this syscall event faults in user space memory */ 1433 mayfault = sys_data->user_mask != 0; 1434 1435 if (mayfault) { 1436 if (syscall_get_data(sys_data, args, &user_ptr, 1437 &size, user_sizes, &uargs, buf_size) < 0) 1438 return; 1439 } 1440 1441 head = this_cpu_ptr(sys_data->enter_event->perf_events); 1442 valid_prog_array = bpf_prog_array_valid(sys_data->enter_event); 1443 if (!valid_prog_array && hlist_empty(head)) 1444 return; 1445 1446 /* get the size after alignment with the u32 buffer size field */ 1447 size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec); 1448 size = ALIGN(size + sizeof(u32), sizeof(u64)); 1449 size -= sizeof(u32); 1450 1451 rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); 1452 if (!rec) 1453 return; 1454 1455 rec->nr = syscall_nr; 1456 memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args); 1457 1458 if (mayfault) 1459 syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs); 1460 1461 if ((valid_prog_array && 1462 !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) || 1463 hlist_empty(head)) { 1464 perf_swevent_put_recursion_context(rctx); 1465 return; 1466 } 1467 1468 perf_trace_buf_submit(rec, size, rctx, 1469 sys_data->enter_event->event.type, 1, regs, 1470 head, NULL); 1471 } 1472 1473 static int perf_sysenter_enable(struct trace_event_call *call) 1474 { 1475 struct syscall_metadata *sys_data = call->data; 1476 int num; 1477 int ret; 1478 1479 num = sys_data->syscall_nr; 1480 1481 guard(mutex)(&syscall_trace_lock); 1482 if (sys_data->user_mask) { 1483 ret = 
syscall_fault_buffer_enable(); 1484 if (ret < 0) 1485 return ret; 1486 } 1487 if (!sys_perf_refcount_enter) { 1488 ret = register_trace_sys_enter(perf_syscall_enter, NULL); 1489 if (ret) { 1490 pr_info("event trace: Could not activate syscall entry trace point"); 1491 if (sys_data->user_mask) 1492 syscall_fault_buffer_disable(); 1493 return ret; 1494 } 1495 } 1496 set_bit(num, enabled_perf_enter_syscalls); 1497 sys_perf_refcount_enter++; 1498 return 0; 1499 } 1500 1501 static void perf_sysenter_disable(struct trace_event_call *call) 1502 { 1503 struct syscall_metadata *sys_data = call->data; 1504 int num; 1505 1506 num = sys_data->syscall_nr; 1507 1508 guard(mutex)(&syscall_trace_lock); 1509 sys_perf_refcount_enter--; 1510 clear_bit(num, enabled_perf_enter_syscalls); 1511 if (!sys_perf_refcount_enter) 1512 unregister_trace_sys_enter(perf_syscall_enter, NULL); 1513 if (sys_data->user_mask) 1514 syscall_fault_buffer_disable(); 1515 } 1516 1517 static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs, 1518 struct syscall_trace_exit *rec) 1519 { 1520 struct syscall_tp_t { 1521 struct trace_entry ent; 1522 int syscall_nr; 1523 unsigned long ret; 1524 } __aligned(8) param; 1525 1526 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */ 1527 perf_fetch_caller_regs(regs); 1528 *(struct pt_regs **)¶m = regs; 1529 param.syscall_nr = rec->nr; 1530 param.ret = rec->ret; 1531 return trace_call_bpf(call, ¶m); 1532 } 1533 1534 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 1535 { 1536 struct syscall_metadata *sys_data; 1537 struct syscall_trace_exit *rec; 1538 struct pt_regs *fake_regs; 1539 struct hlist_head *head; 1540 bool valid_prog_array; 1541 int syscall_nr; 1542 int rctx; 1543 int size; 1544 1545 /* 1546 * Syscall probe called with preemption enabled, but the ring 1547 * buffer and per-cpu data require preemption to be disabled. 
1548 */ 1549 might_fault(); 1550 guard(preempt_notrace)(); 1551 1552 syscall_nr = trace_get_syscall_nr(current, regs); 1553 if (syscall_nr < 0 || syscall_nr >= NR_syscalls) 1554 return; 1555 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 1556 return; 1557 1558 sys_data = syscall_nr_to_meta(syscall_nr); 1559 if (!sys_data) 1560 return; 1561 1562 head = this_cpu_ptr(sys_data->exit_event->perf_events); 1563 valid_prog_array = bpf_prog_array_valid(sys_data->exit_event); 1564 if (!valid_prog_array && hlist_empty(head)) 1565 return; 1566 1567 /* We can probably do that at build time */ 1568 size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64)); 1569 size -= sizeof(u32); 1570 1571 rec = perf_trace_buf_alloc(size, &fake_regs, &rctx); 1572 if (!rec) 1573 return; 1574 1575 rec->nr = syscall_nr; 1576 rec->ret = syscall_get_return_value(current, regs); 1577 1578 if ((valid_prog_array && 1579 !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) || 1580 hlist_empty(head)) { 1581 perf_swevent_put_recursion_context(rctx); 1582 return; 1583 } 1584 1585 perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, 1586 1, regs, head, NULL); 1587 } 1588 1589 static int perf_sysexit_enable(struct trace_event_call *call) 1590 { 1591 int num; 1592 1593 num = ((struct syscall_metadata *)call->data)->syscall_nr; 1594 1595 guard(mutex)(&syscall_trace_lock); 1596 if (!sys_perf_refcount_exit) { 1597 int ret = register_trace_sys_exit(perf_syscall_exit, NULL); 1598 if (ret) { 1599 pr_info("event trace: Could not activate syscall exit trace point"); 1600 return ret; 1601 } 1602 } 1603 set_bit(num, enabled_perf_exit_syscalls); 1604 sys_perf_refcount_exit++; 1605 return 0; 1606 } 1607 1608 static void perf_sysexit_disable(struct trace_event_call *call) 1609 { 1610 int num; 1611 1612 num = ((struct syscall_metadata *)call->data)->syscall_nr; 1613 1614 guard(mutex)(&syscall_trace_lock); 1615 sys_perf_refcount_exit--; 1616 clear_bit(num, enabled_perf_exit_syscalls); 1617 if 
(!sys_perf_refcount_exit) 1618 unregister_trace_sys_exit(perf_syscall_exit, NULL); 1619 } 1620 1621 #endif /* CONFIG_PERF_EVENTS */ 1622 1623 static int syscall_enter_register(struct trace_event_call *event, 1624 enum trace_reg type, void *data) 1625 { 1626 struct trace_event_file *file = data; 1627 1628 switch (type) { 1629 case TRACE_REG_REGISTER: 1630 return reg_event_syscall_enter(file, event); 1631 case TRACE_REG_UNREGISTER: 1632 unreg_event_syscall_enter(file, event); 1633 return 0; 1634 1635 #ifdef CONFIG_PERF_EVENTS 1636 case TRACE_REG_PERF_REGISTER: 1637 return perf_sysenter_enable(event); 1638 case TRACE_REG_PERF_UNREGISTER: 1639 perf_sysenter_disable(event); 1640 return 0; 1641 case TRACE_REG_PERF_OPEN: 1642 case TRACE_REG_PERF_CLOSE: 1643 case TRACE_REG_PERF_ADD: 1644 case TRACE_REG_PERF_DEL: 1645 return 0; 1646 #endif 1647 } 1648 return 0; 1649 } 1650 1651 static int syscall_exit_register(struct trace_event_call *event, 1652 enum trace_reg type, void *data) 1653 { 1654 struct trace_event_file *file = data; 1655 1656 switch (type) { 1657 case TRACE_REG_REGISTER: 1658 return reg_event_syscall_exit(file, event); 1659 case TRACE_REG_UNREGISTER: 1660 unreg_event_syscall_exit(file, event); 1661 return 0; 1662 1663 #ifdef CONFIG_PERF_EVENTS 1664 case TRACE_REG_PERF_REGISTER: 1665 return perf_sysexit_enable(event); 1666 case TRACE_REG_PERF_UNREGISTER: 1667 perf_sysexit_disable(event); 1668 return 0; 1669 case TRACE_REG_PERF_OPEN: 1670 case TRACE_REG_PERF_CLOSE: 1671 case TRACE_REG_PERF_ADD: 1672 case TRACE_REG_PERF_DEL: 1673 return 0; 1674 #endif 1675 } 1676 return 0; 1677 } 1678