1 // SPDX-License-Identifier: GPL-2.0
2 #include <trace/syscall.h>
3 #include <trace/events/syscalls.h>
4 #include <linux/kernel_stat.h>
5 #include <linux/syscalls.h>
6 #include <linux/slab.h>
7 #include <linux/kernel.h>
8 #include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
9 #include <linux/ftrace.h>
10 #include <linux/perf_event.h>
11 #include <linux/xarray.h>
12 #include <asm/syscall.h>
13
14 #include "trace_output.h"
15 #include "trace.h"
16
17 static DEFINE_MUTEX(syscall_trace_lock);
18
19 static int syscall_enter_register(struct trace_event_call *event,
20 enum trace_reg type, void *data);
21 static int syscall_exit_register(struct trace_event_call *event,
22 enum trace_reg type, void *data);
23
24 static struct list_head *
syscall_get_enter_fields(struct trace_event_call * call)25 syscall_get_enter_fields(struct trace_event_call *call)
26 {
27 struct syscall_metadata *entry = call->data;
28
29 return &entry->enter_fields;
30 }
31
32 extern struct syscall_metadata *__start_syscalls_metadata[];
33 extern struct syscall_metadata *__stop_syscalls_metadata[];
34
35 static DEFINE_XARRAY(syscalls_metadata_sparse);
36 static struct syscall_metadata **syscalls_metadata;
37
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Match a kallsyms symbol against a syscall metadata name while skipping
 * the first three characters. Archs that use syscall wrappers may expose
 * aliases prefixed ".SyS" or ".sys" instead of "sys", so only the part
 * after the prefix is significant for the comparison.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	return strcmp(sym + 3, name + 3) == 0;
}
#endif
50
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 * *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	/* -1 makes the callers drop the event for compat tasks */
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
/* No compat special-casing: just report the task's syscall number */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82
83 static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)84 find_syscall_meta(unsigned long syscall)
85 {
86 struct syscall_metadata **start;
87 struct syscall_metadata **stop;
88 char str[KSYM_SYMBOL_LEN];
89
90
91 start = __start_syscalls_metadata;
92 stop = __stop_syscalls_metadata;
93 kallsyms_lookup(syscall, NULL, NULL, NULL, str);
94
95 if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
96 return NULL;
97
98 for ( ; start < stop; start++) {
99 if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
100 return *start;
101 }
102 return NULL;
103 }
104
syscall_nr_to_meta(int nr)105 static struct syscall_metadata *syscall_nr_to_meta(int nr)
106 {
107 if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
108 return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
109
110 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
111 return NULL;
112
113 return syscalls_metadata[nr];
114 }
115
get_syscall_name(int syscall)116 const char *get_syscall_name(int syscall)
117 {
118 struct syscall_metadata *entry;
119
120 entry = syscall_nr_to_meta(syscall);
121 if (!entry)
122 return NULL;
123
124 return entry->name;
125 }
126
127 /* Added to user strings or arrays when max limit is reached */
128 #define EXTRA "..."
129
/*
 * Locate the recorded user space data for the next user_mask argument.
 *
 * After the fixed args of @trace, each user-data argument has a 4-byte
 * meta word encoding (len << 16) | offset, where offset is measured from
 * the start of the event (see syscall_put_data()). *@offset_p tracks how
 * far into the meta words the caller has walked and is advanced by 4.
 * On return, *@ptr_p points at the data and *@len_p holds its length.
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* This arg points to a user space string */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}
150
/*
 * Pretty-print a sys_enter event for openat().
 *
 * Arguments are printed as "name: value", with the flags argument (arg 2)
 * decoded symbolically via @__flags and the mode (arg 3) in octal. When
 * neither O_CREAT nor O_TMPFILE is set the mode argument is suppressed
 * (@done). Arguments flagged in @entry->user_mask have their recorded
 * user space data appended as a quoted string.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	static const struct trace_print_flags __flags[] =
	{
		{ O_TMPFILE, "O_TMPFILE" },
		{ O_WRONLY, "O_WRONLY" },
		{ O_RDWR, "O_RDWR" },
		{ O_CREAT, "O_CREAT" },
		{ O_EXCL, "O_EXCL" },
		{ O_NOCTTY, "O_NOCTTY" },
		{ O_TRUNC, "O_TRUNC" },
		{ O_APPEND, "O_APPEND" },
		{ O_NONBLOCK, "O_NONBLOCK" },
		{ O_DSYNC, "O_DSYNC" },
		{ O_DIRECT, "O_DIRECT" },
		{ O_LARGEFILE, "O_LARGEFILE" },
		{ O_DIRECTORY, "O_DIRECTORY" },
		{ O_NOFOLLOW, "O_NOFOLLOW" },
		{ O_NOATIME, "O_NOATIME" },
		{ O_CLOEXEC, "O_CLOEXEC" },
		{ -1, NULL }
	};

	trace_seq_printf(s, "%s(", entry->name);

	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* O_RDONLY is zero, so it must be spelled out by hand */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags);
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			/* mode is traditionally shown in octal */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Append the user space string recorded for this argument */
		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
 end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
239
/*
 * Generic sys_enter event pretty-printer.
 *
 * openat gets a dedicated printer unless the "verbose" trace flag is set.
 * Each argument is printed as "name: value" (decimal below 10, hex
 * otherwise). For arguments flagged in @entry->user_mask, the recorded
 * user data is appended: as a quoted string for string args, or as a hex
 * dump (plus a printable rendering when any byte is printable) for sized
 * buffer args.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	/* The event type must match the metadata's enter event */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	switch (entry->syscall_nr) {
	case __NR_openat:
		/* Verbose output falls back to the generic printer below */
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values: small ones in decimal, the rest in hex */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* Strings are simply quoted */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* The requested size; used below to flag truncation */
		val = trace->args[entry->user_arg_size];

		/* Hex dump of the buffer, e.g. (de:ad:be:ef) */
		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
347
348 static enum print_line_t
print_syscall_exit(struct trace_iterator * iter,int flags,struct trace_event * event)349 print_syscall_exit(struct trace_iterator *iter, int flags,
350 struct trace_event *event)
351 {
352 struct trace_seq *s = &iter->seq;
353 struct trace_entry *ent = iter->ent;
354 struct syscall_trace_exit *trace;
355 int syscall;
356 struct syscall_metadata *entry;
357
358 trace = (typeof(trace))ent;
359 syscall = trace->nr;
360 entry = syscall_nr_to_meta(syscall);
361
362 if (!entry) {
363 trace_seq_putc(s, '\n');
364 goto out;
365 }
366
367 if (entry->exit_event->event.type != ent->type) {
368 WARN_ON_ONCE(1);
369 return TRACE_TYPE_UNHANDLED;
370 }
371
372 trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
373 trace->ret);
374
375 out:
376 return trace_handle_return(s);
377 }
378
/* Describe one fixed-size field of a syscall event (type, name, layout) */
#define SYSCALL_FIELD(_type, _name) {					\
	.type = #_type, .name = #_name,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
386
/*
 * Build the "print fmt" string exposed in the openat enter event's format
 * file, mirroring what sys_enter_openat_print() produces for consumers
 * that parse the format themselves. When @len is zero, nothing is written
 * and only the required length is computed (see LEN_OR_ZERO).
 * Returns the length of the format string (excluding the NUL).
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	/* O_RDONLY is zero: prepend it explicitly when no access mode bit is set */
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}
439
/*
 * Build the generic "print fmt" string for an enter event: the quoted
 * format part ("arg: 0x%08lx ...") followed by the REC-> argument list.
 * Arguments flagged in @entry->user_mask additionally reference their
 * recorded user data fields ("__<arg>_val"). openat is delegated to its
 * dedicated builder. When @len is zero, only the needed length is
 * computed. Returns the format string's length (excluding the NUL).
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

/* Last use of LEN_OR_ZERO: drop the helper macro here */
#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
492
set_syscall_print_fmt(struct trace_event_call * call)493 static int __init set_syscall_print_fmt(struct trace_event_call *call)
494 {
495 char *print_fmt;
496 int len;
497 struct syscall_metadata *entry = call->data;
498
499 if (entry->enter_event != call) {
500 call->print_fmt = "\"0x%lx\", REC->ret";
501 return 0;
502 }
503
504 /* First: called with 0 length to calculate the needed length */
505 len = __set_enter_print_fmt(entry, NULL, 0);
506
507 print_fmt = kmalloc(len + 1, GFP_KERNEL);
508 if (!print_fmt)
509 return -ENOMEM;
510
511 /* Second: actually write the @print_fmt */
512 __set_enter_print_fmt(entry, print_fmt, len + 1);
513 call->print_fmt = print_fmt;
514
515 return 0;
516 }
517
free_syscall_print_fmt(struct trace_event_call * call)518 static void __init free_syscall_print_fmt(struct trace_event_call *call)
519 {
520 struct syscall_metadata *entry = call->data;
521
522 if (entry->enter_event == call)
523 kfree(call->print_fmt);
524 }
525
/*
 * Define the trace event fields of an enter event: one unsigned long per
 * syscall argument, followed by a "__data_loc char[]" field named
 * "__<arg>_val" for every argument flagged in user_mask (the recorded
 * user space data). On allocation failure the user_mask is cleared so the
 * event degrades to plain argument recording. Returns 0 or a negative
 * errno from trace_define_field().
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	/* The fixed arguments: one unsigned long each */
	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			/* Fall back to not recording user data at all */
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		/* 4-byte __data_loc meta word: (len << 16) | offset */
		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		offset += 4;
	}
	return ret;
}
582
583 /*
584 * Create a per CPU temporary buffer to copy user space pointers into.
585 *
586 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
587 * (defined in kernel/trace/trace.h)
 *
589 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
590 * nul terminating byte and possibly appended EXTRA (4 bytes).
591 *
592 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
593 * to copy memory from user space addresses into that will hold
594 * 3 args as only 3 args are allowed to be copied from system calls.
595 */
596 #define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
597 #define SYSCALL_FAULT_MAX_CNT 3
598 #define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
599
/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info buf;	/* per CPU copy buffers (refcounted) */
	struct rcu_head rcu;		/* deferred free via RCU tasks trace */
};

/* Written under syscall_trace_lock; readers use READ_ONCE() */
static struct syscall_user_buffer *syscall_buffer;
607
/*
 * Take a reference on the global syscall fault buffer, allocating it on
 * first use. Must be called with syscall_trace_lock held.
 * Returns 0 on success, -ENOMEM or the trace_user_fault_init() error.
 *
 * NOTE(review): assumes trace_user_fault_init() establishes the initial
 * reference that the trace_user_fault_put() in
 * syscall_fault_buffer_disable() pairs with — confirm.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	/* Already allocated: just take another reference */
	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	/* Publish; paired with READ_ONCE() in syscall_get_data() */
	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}
634
rcu_free_syscall_buffer(struct rcu_head * rcu)635 static void rcu_free_syscall_buffer(struct rcu_head *rcu)
636 {
637 struct syscall_user_buffer *sbuf =
638 container_of(rcu, struct syscall_user_buffer, rcu);
639
640 trace_user_fault_destroy(&sbuf->buf);
641 kfree(sbuf);
642 }
643
644
/*
 * Drop a reference on the syscall fault buffer. On the final put (the
 * put returning zero — presumably no references remain; confirm against
 * trace_user_fault_put()), unpublish the buffer and free it once all
 * tasks-trace RCU readers are done. Must hold syscall_trace_lock.
 */
static void syscall_fault_buffer_disable(void)
{
	struct syscall_user_buffer *sbuf = syscall_buffer;

	lockdep_assert_held(&syscall_trace_lock);

	if (trace_user_fault_put(&sbuf->buf))
		return;

	WRITE_ONCE(syscall_buffer, NULL);
	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
}
657
/* State shared with the user-copy callbacks via the @data argument */
struct syscall_args {
	char *ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user pointers to read */
	int read[SYSCALL_FAULT_MAX_CNT];	/* per-pointer copy result */
	int uargs;				/* number of pointers in use */
};
663
/*
 * Copy callback for string arguments.
 *
 * Ignores the incoming @ptr and instead walks the user pointers stashed
 * in @data (struct syscall_args), copying each as a string of at most
 * @size bytes into consecutive SYSCALL_FAULT_ARG_SZ slots of @buf. The
 * strncpy_from_user() result (length, or negative on fault) is saved per
 * argument in args->read[]. Always returns 0: per-argument faults are
 * reported via args->read[], not the return value.
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = strncpy_from_user(buf, ptr, size);
		args->read[i] = ret;
	}
	return 0;
}
677
/*
 * Copy callback for sized (binary array) arguments.
 *
 * Like syscall_copy_user() but copies exactly @size bytes per argument
 * with __copy_from_user(). Any partial failure marks that argument's
 * read[] as -1; a full copy records @size. Always returns 0.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int ret;

	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		ptr = (char __user *)args->ptr_array[i];
		ret = __copy_from_user(buf, ptr, size);
		args->read[i] = ret ? -1 : size;
	}
	return 0;
}
691
/*
 * Copy the user space memory referenced by @args into the per-CPU buffer.
 *
 * Each argument flagged in @sys_data->user_mask occupies one
 * SYSCALL_FAULT_ARG_SZ slot of the returned buffer. @data_size[] is
 * filled with the number of bytes recorded per slot; slots beyond the
 * used arguments are set to -1 ("no pointer").
 *
 * When the metadata names a size argument (user_arg_size >= 0) the data
 * is read as a binary array of that size (clamped to the slot) via
 * syscall_copy_user_array(); otherwise it is read as a string, with
 * non-printable characters replaced by '.' and EXTRA ("...") appended on
 * truncation.
 *
 * Returns the buffer, or NULL when @buf_size is zero or the read failed.
 * NOTE(review): on the early NULL returns, data_size[] slots below
 * sargs.uargs are left untouched — callers must pre-initialize the array
 * (ftrace_syscall_enter() zeroes user_sizes) — confirm this is intended.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				/* Record the terminating NUL too */
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}
787
/*
 * Fault in the user space data for a syscall and size the event.
 *
 * Sets *@buffer to the per-CPU copy buffer (or NULL), fills
 * @user_sizes[] with per-argument byte counts, stores the number of user
 * arguments in *@uargs and grows *@size by the data plus one 4-byte meta
 * word per argument. Returns 0 on success, -1 when the fault buffer is
 * gone (tracing is shutting down).
 *
 * NOTE(review): @user_sizes is int * here while sys_fault_user() takes
 * unsigned int[] — the -1 sentinel relies on signed compares; confirm
 * the signedness mismatch is intentional.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}
817
/*
 * Append the faulted user data in @buffer to the ring buffer event.
 *
 * After the fixed args of @entry, one 4-byte meta word per user argument
 * is written, encoding (size << 16) | offset-from-event-start of that
 * argument's data (decoded by get_dynamic_len_ptr()). The data itself is
 * then copied in, packed immediately after the meta words.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		/* Each arg's data starts where the previous one ended */
		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	/* Each source slot is SYSCALL_FAULT_ARG_SZ apart in @buffer */
	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}
860
/*
 * sys_enter tracepoint probe: record a syscall_trace_enter event for the
 * trace instance in @data.
 *
 * The event carries the syscall number and its arguments; for syscalls
 * whose metadata flags user space pointers (user_mask), the referenced
 * user memory is copied and appended to the event.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	/* Pre-zeroed: sys_fault_user() may return before filling these */
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* Only record if this syscall's enter event is enabled for @tr */
	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	/*
	 * NOTE(review): user memory is read inside the preempt-disabled
	 * region via trace_user_fault_read() — presumably that helper
	 * copes with faulting in this context; confirm.
	 */
	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	/* Event size: header + fixed args + any appended user data */
	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}
926
/*
 * sys_exit tracepoint probe: record a syscall_trace_exit event (syscall
 * number and return value) for the trace instance in @data.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* Only record if this syscall's exit event is enabled for @tr */
	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}
968
/*
 * Enable syscall-entry tracing of @call for @file's trace instance.
 *
 * Takes a reference on the fault buffer when the syscall records user
 * data, and registers the sys_enter tracepoint probe when this is the
 * instance's first enabled enter event.
 * Returns 0 on success or a negative errno.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Undo the fault buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}
998
/*
 * Disable syscall-entry tracing of @call for @file's trace instance:
 * mirror image of reg_event_syscall_enter(). Unregisters the tracepoint
 * probe when the instance's last enter event is disabled, and drops the
 * fault buffer reference for user-data syscalls.
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1017
reg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1018 static int reg_event_syscall_exit(struct trace_event_file *file,
1019 struct trace_event_call *call)
1020 {
1021 struct trace_array *tr = file->tr;
1022 int ret = 0;
1023 int num;
1024
1025 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1026 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1027 return -ENOSYS;
1028 mutex_lock(&syscall_trace_lock);
1029 if (!tr->sys_refcount_exit)
1030 ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
1031 if (!ret) {
1032 WRITE_ONCE(tr->exit_syscall_files[num], file);
1033 tr->sys_refcount_exit++;
1034 }
1035 mutex_unlock(&syscall_trace_lock);
1036 return ret;
1037 }
1038
unreg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1039 static void unreg_event_syscall_exit(struct trace_event_file *file,
1040 struct trace_event_call *call)
1041 {
1042 struct trace_array *tr = file->tr;
1043 int num;
1044
1045 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1046 if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1047 return;
1048 mutex_lock(&syscall_trace_lock);
1049 tr->sys_refcount_exit--;
1050 WRITE_ONCE(tr->exit_syscall_files[num], NULL);
1051 if (!tr->sys_refcount_exit)
1052 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
1053 mutex_unlock(&syscall_trace_lock);
1054 }
1055
1056 /*
1057 * For system calls that reference user space memory that can
1058 * be recorded into the event, set the system call meta data's user_mask
1059 * to the "args" index that points to the user space memory to retrieve.
1060 */
check_faultable_syscall(struct trace_event_call * call,int nr)1061 static void check_faultable_syscall(struct trace_event_call *call, int nr)
1062 {
1063 struct syscall_metadata *sys_data = call->data;
1064 unsigned long mask;
1065
1066 /* Only work on entry */
1067 if (sys_data->enter_event != call)
1068 return;
1069
1070 sys_data->user_arg_size = -1;
1071
1072 switch (nr) {
1073 /* user arg 1 with size arg at 2 */
1074 case __NR_write:
1075 #ifdef __NR_mq_timedsend
1076 case __NR_mq_timedsend:
1077 #endif
1078 case __NR_pwrite64:
1079 sys_data->user_mask = BIT(1);
1080 sys_data->user_arg_size = 2;
1081 break;
1082 /* user arg 0 with size arg at 1 as string */
1083 case __NR_setdomainname:
1084 case __NR_sethostname:
1085 sys_data->user_mask = BIT(0);
1086 sys_data->user_arg_size = 1;
1087 sys_data->user_arg_is_str = 1;
1088 break;
1089 #ifdef __NR_kexec_file_load
1090 /* user arg 4 with size arg at 3 as string */
1091 case __NR_kexec_file_load:
1092 sys_data->user_mask = BIT(4);
1093 sys_data->user_arg_size = 3;
1094 sys_data->user_arg_is_str = 1;
1095 break;
1096 #endif
1097 /* user arg at position 0 */
1098 #ifdef __NR_access
1099 case __NR_access:
1100 #endif
1101 case __NR_acct:
1102 case __NR_chdir:
1103 #ifdef __NR_chown
1104 case __NR_chown:
1105 #endif
1106 #ifdef __NR_chmod
1107 case __NR_chmod:
1108 #endif
1109 case __NR_chroot:
1110 #ifdef __NR_creat
1111 case __NR_creat:
1112 #endif
1113 case __NR_delete_module:
1114 case __NR_execve:
1115 case __NR_fsopen:
1116 #ifdef __NR_lchown
1117 case __NR_lchown:
1118 #endif
1119 #ifdef __NR_open
1120 case __NR_open:
1121 #endif
1122 case __NR_memfd_create:
1123 #ifdef __NR_mkdir
1124 case __NR_mkdir:
1125 #endif
1126 #ifdef __NR_mknod
1127 case __NR_mknod:
1128 #endif
1129 case __NR_mq_open:
1130 case __NR_mq_unlink:
1131 #ifdef __NR_readlink
1132 case __NR_readlink:
1133 #endif
1134 #ifdef __NR_rmdir
1135 case __NR_rmdir:
1136 #endif
1137 case __NR_shmdt:
1138 #ifdef __NR_statfs
1139 case __NR_statfs:
1140 #endif
1141 case __NR_swapon:
1142 case __NR_swapoff:
1143 #ifdef __NR_truncate
1144 case __NR_truncate:
1145 #endif
1146 #ifdef __NR_unlink
1147 case __NR_unlink:
1148 #endif
1149 case __NR_umount2:
1150 #ifdef __NR_utime
1151 case __NR_utime:
1152 #endif
1153 #ifdef __NR_utimes
1154 case __NR_utimes:
1155 #endif
1156 sys_data->user_mask = BIT(0);
1157 break;
1158 /* user arg at position 1 */
1159 case __NR_execveat:
1160 case __NR_faccessat:
1161 case __NR_faccessat2:
1162 case __NR_finit_module:
1163 case __NR_fchmodat:
1164 case __NR_fchmodat2:
1165 case __NR_fchownat:
1166 case __NR_fgetxattr:
1167 case __NR_flistxattr:
1168 case __NR_fsetxattr:
1169 case __NR_fspick:
1170 case __NR_fremovexattr:
1171 #ifdef __NR_futimesat
1172 case __NR_futimesat:
1173 #endif
1174 case __NR_inotify_add_watch:
1175 case __NR_mkdirat:
1176 case __NR_mknodat:
1177 case __NR_mount_setattr:
1178 case __NR_name_to_handle_at:
1179 #ifdef __NR_newfstatat
1180 case __NR_newfstatat:
1181 #endif
1182 case __NR_openat:
1183 case __NR_openat2:
1184 case __NR_open_tree:
1185 case __NR_open_tree_attr:
1186 case __NR_readlinkat:
1187 case __NR_quotactl:
1188 case __NR_syslog:
1189 case __NR_statx:
1190 case __NR_unlinkat:
1191 #ifdef __NR_utimensat
1192 case __NR_utimensat:
1193 #endif
1194 sys_data->user_mask = BIT(1);
1195 break;
1196 /* user arg at position 2 */
1197 case __NR_init_module:
1198 case __NR_fsconfig:
1199 sys_data->user_mask = BIT(2);
1200 break;
1201 /* user arg at position 4 */
1202 case __NR_fanotify_mark:
1203 sys_data->user_mask = BIT(4);
1204 break;
1205 /* 2 user args, 0 and 1 */
1206 case __NR_add_key:
1207 case __NR_getxattr:
1208 case __NR_lgetxattr:
1209 case __NR_lremovexattr:
1210 #ifdef __NR_link
1211 case __NR_link:
1212 #endif
1213 case __NR_listxattr:
1214 case __NR_llistxattr:
1215 case __NR_lsetxattr:
1216 case __NR_pivot_root:
1217 case __NR_removexattr:
1218 #ifdef __NR_rename
1219 case __NR_rename:
1220 #endif
1221 case __NR_request_key:
1222 case __NR_setxattr:
1223 #ifdef __NR_symlink
1224 case __NR_symlink:
1225 #endif
1226 sys_data->user_mask = BIT(0) | BIT(1);
1227 break;
1228 /* 2 user args, 0 and 2 */
1229 case __NR_symlinkat:
1230 sys_data->user_mask = BIT(0) | BIT(2);
1231 break;
1232 /* 2 user args, 1 and 3 */
1233 case __NR_getxattrat:
1234 case __NR_linkat:
1235 case __NR_listxattrat:
1236 case __NR_move_mount:
1237 #ifdef __NR_renameat
1238 case __NR_renameat:
1239 #endif
1240 case __NR_renameat2:
1241 case __NR_removexattrat:
1242 case __NR_setxattrat:
1243 sys_data->user_mask = BIT(1) | BIT(3);
1244 break;
1245 case __NR_mount: /* Just dev_name and dir_name, TODO add type */
1246 sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
1247 break;
1248 default:
1249 sys_data->user_mask = 0;
1250 return;
1251 }
1252
1253 if (sys_data->user_arg_size < 0)
1254 return;
1255
1256 /*
1257 * The user_arg_size can only be used when the system call
1258 * is reading only a single address from user space.
1259 */
1260 mask = sys_data->user_mask;
1261 if (WARN_ON(mask & (mask - 1)))
1262 sys_data->user_arg_size = -1;
1263 }
1264
init_syscall_trace(struct trace_event_call * call)1265 static int __init init_syscall_trace(struct trace_event_call *call)
1266 {
1267 int id;
1268 int num;
1269
1270 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1271 if (num < 0 || num >= NR_syscalls) {
1272 pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
1273 ((struct syscall_metadata *)call->data)->name);
1274 return -ENOSYS;
1275 }
1276
1277 check_faultable_syscall(call, num);
1278
1279 if (set_syscall_print_fmt(call) < 0)
1280 return -ENOMEM;
1281
1282 id = trace_event_raw_init(call);
1283
1284 if (id < 0) {
1285 free_syscall_print_fmt(call);
1286 return id;
1287 }
1288
1289 return id;
1290 }
1291
/*
 * Fields for syscall-entry events: a fixed __syscall_nr field followed
 * by per-syscall argument fields filled in dynamically by
 * syscall_enter_define_fields().
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};
1298
/* Output callbacks that render enter/exit events in the trace file */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace = print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace = print_syscall_exit,
};
1306
/*
 * Event class shared by all "syscalls:sys_enter_*" events.  Entry
 * events have variable per-syscall fields, so the field list is
 * provided via ->get_fields/->fields_array rather than a static list.
 */
struct trace_event_class __refdata event_class_syscall_enter = {
	.system = "syscalls",
	.reg = syscall_enter_register,
	.fields_array = syscall_enter_fields_array,
	.get_fields = syscall_get_enter_fields,
	.raw_init = init_syscall_trace,
};

/*
 * Event class shared by all "syscalls:sys_exit_*" events.  Exit events
 * always have the same two fields (__syscall_nr and ret), so they are
 * listed inline here.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
	.system = "syscalls",
	.reg = syscall_exit_register,
	.fields_array = (struct trace_event_fields[]){
		SYSCALL_FIELD(int, __syscall_nr),
		SYSCALL_FIELD(long, ret),
		{}
	},
	.fields = LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init = init_syscall_trace,
};
1326
/*
 * Map a syscall number to its handler address.  Weak default reads the
 * generic sys_call_table; architectures with different table layouts
 * override this.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}
1331
init_ftrace_syscalls(void)1332 void __init init_ftrace_syscalls(void)
1333 {
1334 struct syscall_metadata *meta;
1335 unsigned long addr;
1336 int i;
1337 void *ret;
1338
1339 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1340 syscalls_metadata = kcalloc(NR_syscalls,
1341 sizeof(*syscalls_metadata),
1342 GFP_KERNEL);
1343 if (!syscalls_metadata) {
1344 WARN_ON(1);
1345 return;
1346 }
1347 }
1348
1349 for (i = 0; i < NR_syscalls; i++) {
1350 addr = arch_syscall_addr(i);
1351 meta = find_syscall_meta(addr);
1352 if (!meta)
1353 continue;
1354
1355 meta->syscall_nr = i;
1356
1357 if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1358 syscalls_metadata[i] = meta;
1359 } else {
1360 ret = xa_store(&syscalls_metadata_sparse, i, meta,
1361 GFP_KERNEL);
1362 WARN(xa_is_err(ret),
1363 "Syscall memory allocation failed\n");
1364 }
1365
1366 }
1367 }
1368
#ifdef CONFIG_PERF_EVENTS

/* Per-syscall bitmaps: which syscalls have perf enter/exit events enabled */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/* Users of the sys_enter/sys_exit probes; protected by syscall_trace_lock */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
1375
perf_call_bpf_enter(struct trace_event_call * call,struct pt_regs * regs,struct syscall_metadata * sys_data,struct syscall_trace_enter * rec)1376 static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
1377 struct syscall_metadata *sys_data,
1378 struct syscall_trace_enter *rec)
1379 {
1380 struct syscall_tp_t {
1381 struct trace_entry ent;
1382 int syscall_nr;
1383 unsigned long args[SYSCALL_DEFINE_MAXARGS];
1384 } __aligned(8) param;
1385 int i;
1386
1387 BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));
1388
1389 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
1390 perf_fetch_caller_regs(regs);
1391 *(struct pt_regs **)¶m = regs;
1392 param.syscall_nr = rec->nr;
1393 for (i = 0; i < sys_data->nb_args; i++)
1394 param.args[i] = rec->args[i];
1395 return trace_call_bpf(call, ¶m);
1396 }
1397
/*
 * sys_enter tracepoint probe for perf: build a syscall_trace_enter
 * record (optionally appending copies of the user-space memory that
 * check_faultable_syscall() flagged), run any attached BPF programs,
 * and submit the record to the perf buffer.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	unsigned long args[6];
	bool valid_prog_array;
	bool mayfault;
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
	int syscall_nr;
	int rctx;
	int size = 0;
	int uargs = 0;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	if (mayfault) {
		/* On failure to read user memory, drop the event entirely */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the copied user-space data after the fixed arguments */
	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/* A BPF verdict of 0 (or no perf listeners) discards the sample */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}
1474
/*
 * Enable a perf syscall-entry event: take a reference on the fault
 * buffer if the syscall records user memory, register the sys_enter
 * probe on first use, and mark the syscall enabled in the bitmap.
 *
 * Returns 0 on success or a negative error (the fault buffer reference
 * is dropped again if probe registration fails).
 */
static int perf_sysenter_enable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;
	int ret;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!sys_perf_refcount_enter) {
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (ret) {
			pr_info("event trace: Could not activate syscall entry trace point");
			/* Undo the fault buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	set_bit(num, enabled_perf_enter_syscalls);
	sys_perf_refcount_enter++;
	return 0;
}
1502
/*
 * Disable a perf syscall-entry event: drop the probe refcount
 * (unregistering the probe when it hits zero), clear the enabled bit,
 * and release the fault buffer reference taken in
 * perf_sysenter_enable() for user-memory syscalls.
 */
static void perf_sysenter_disable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1518
perf_call_bpf_exit(struct trace_event_call * call,struct pt_regs * regs,struct syscall_trace_exit * rec)1519 static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
1520 struct syscall_trace_exit *rec)
1521 {
1522 struct syscall_tp_t {
1523 struct trace_entry ent;
1524 int syscall_nr;
1525 unsigned long ret;
1526 } __aligned(8) param;
1527
1528 /* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. ¶m) */
1529 perf_fetch_caller_regs(regs);
1530 *(struct pt_regs **)¶m = regs;
1531 param.syscall_nr = rec->nr;
1532 param.ret = rec->ret;
1533 return trace_call_bpf(call, ¶m);
1534 }
1535
/*
 * sys_exit tracepoint probe for perf: build a fixed-size
 * syscall_trace_exit record (syscall number + return value), run any
 * attached BPF programs, and submit the record to the perf buffer.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	bool valid_prog_array;
	int syscall_nr;
	int rctx;
	int size;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	/* A BPF verdict of 0 (or no perf listeners) discards the sample */
	if ((valid_prog_array &&
	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
			      1, regs, head, NULL);
}
1590
perf_sysexit_enable(struct trace_event_call * call)1591 static int perf_sysexit_enable(struct trace_event_call *call)
1592 {
1593 int num;
1594
1595 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1596
1597 guard(mutex)(&syscall_trace_lock);
1598 if (!sys_perf_refcount_exit) {
1599 int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
1600 if (ret) {
1601 pr_info("event trace: Could not activate syscall exit trace point");
1602 return ret;
1603 }
1604 }
1605 set_bit(num, enabled_perf_exit_syscalls);
1606 sys_perf_refcount_exit++;
1607 return 0;
1608 }
1609
perf_sysexit_disable(struct trace_event_call * call)1610 static void perf_sysexit_disable(struct trace_event_call *call)
1611 {
1612 int num;
1613
1614 num = ((struct syscall_metadata *)call->data)->syscall_nr;
1615
1616 guard(mutex)(&syscall_trace_lock);
1617 sys_perf_refcount_exit--;
1618 clear_bit(num, enabled_perf_exit_syscalls);
1619 if (!sys_perf_refcount_exit)
1620 unregister_trace_sys_exit(perf_syscall_exit, NULL);
1621 }
1622
1623 #endif /* CONFIG_PERF_EVENTS */
1624
/*
 * trace_event_class ->reg callback for syscall-entry events: dispatch
 * ftrace and perf (un)registration requests to the matching helpers.
 * The PERF_OPEN/CLOSE/ADD/DEL requests need no per-event work here.
 */
static int syscall_enter_register(struct trace_event_call *event,
				  enum trace_reg type, void *data)
{
	/* For ftrace requests, @data is the trace_event_file */
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}
1652
/*
 * trace_event_class ->reg callback for syscall-exit events: dispatch
 * ftrace and perf (un)registration requests to the matching helpers.
 * The PERF_OPEN/CLOSE/ADD/DEL requests need no per-event work here.
 */
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data)
{
	/* For ftrace requests, @data is the trace_event_file */
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}
1680