xref: /linux/kernel/trace/trace_syscalls.c (revision 69c5079b49fa120c1a108b6e28b3a6a8e4ae2db5)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <trace/syscall.h>
3 #include <trace/events/syscalls.h>
4 #include <linux/kernel_stat.h>
5 #include <linux/syscalls.h>
6 #include <linux/slab.h>
7 #include <linux/kernel.h>
8 #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
9 #include <linux/ftrace.h>
10 #include <linux/perf_event.h>
11 #include <linux/xarray.h>
12 #include <asm/syscall.h>
13 
14 #include "trace_output.h"
15 #include "trace.h"
16 
/* Serializes syscall tracepoint (un)registration and the fault-buffer refcount */
static DEFINE_MUTEX(syscall_trace_lock);

/* Forward declarations: registration callbacks wired into the event calls below */
static int syscall_enter_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data);
23 
24 static struct list_head *
syscall_get_enter_fields(struct trace_event_call * call)25 syscall_get_enter_fields(struct trace_event_call *call)
26 {
27 	struct syscall_metadata *entry = call->data;
28 
29 	return &entry->enter_fields;
30 }
31 
/* Linker-section boundaries of the syscall metadata table (see SYSCALL_METADATA) */
extern struct syscall_metadata *__start_syscalls_metadata[];
extern struct syscall_metadata *__stop_syscalls_metadata[];

/* Sparse-nr archs index metadata via an xarray; others use the flat array below */
static DEFINE_XARRAY(syscalls_metadata_sparse);
static struct syscall_metadata **syscalls_metadata;
37 
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	/*
	 * Skip the three-character prefix on both sides before comparing.
	 * Archs that use syscall wrappers may alias syscall symbols with
	 * ".SyS" or ".sys" instead of "sys", which would otherwise cause
	 * a spurious mismatch.
	 */
	return strcmp(sym + 3, name + 3) == 0;
}
#endif
50 
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 *     *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	/* A compat syscall number would map to the wrong metadata; drop it */
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
/* No compat mapping issue on this arch: just report the raw syscall number */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82 
83 static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)84 find_syscall_meta(unsigned long syscall)
85 {
86 	struct syscall_metadata **start;
87 	struct syscall_metadata **stop;
88 	char str[KSYM_SYMBOL_LEN];
89 
90 
91 	start = __start_syscalls_metadata;
92 	stop = __stop_syscalls_metadata;
93 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
94 
95 	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
96 		return NULL;
97 
98 	for ( ; start < stop; start++) {
99 		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
100 			return *start;
101 	}
102 	return NULL;
103 }
104 
syscall_nr_to_meta(int nr)105 static struct syscall_metadata *syscall_nr_to_meta(int nr)
106 {
107 	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
108 		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
109 
110 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
111 		return NULL;
112 
113 	return syscalls_metadata[nr];
114 }
115 
get_syscall_name(int syscall)116 const char *get_syscall_name(int syscall)
117 {
118 	struct syscall_metadata *entry;
119 
120 	entry = syscall_nr_to_meta(syscall);
121 	if (!entry)
122 		return NULL;
123 
124 	return entry->name;
125 }
126 
/* Appended to user space strings or arrays when the max copy limit truncated them */
#define EXTRA "..."
129 
/*
 * Decode the location of recorded user space data for a faultable argument.
 *
 * Each faultable arg has a 4-byte meta field stored right after the
 * syscall's fixed args, encoded as (len << 16 | offset) where offset is
 * from the start of the event record (see syscall_put_data()).
 *
 * @trace:    the syscall-enter event record
 * @entry:    metadata of the syscall
 * @offset_p: in/out: byte offset of this arg's meta field; advanced by 4
 * @len_p:    out: number of bytes of user data recorded
 * @ptr_p:    out: pointer to the user data inside the event record
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* Meta fields live directly after the nb_args fixed arguments */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;	/* advance to the next meta field */

	*ptr_p = ptr;
	*offset_p = offset;
}
150 
/*
 * Custom pretty printer for sys_enter_openat (used in non-verbose mode).
 *
 * Prints the flags argument symbolically ("O_CREAT|O_CLOEXEC|...") and the
 * mode argument in octal.  The mode is skipped entirely when flags contain
 * neither O_CREAT nor O_TMPFILE, as it is meaningless then.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	/* O_TMPFILE first: it contains O_DIRECTORY, so it must match before it */
	static const struct trace_print_flags __flags[] =
		{
			{ O_TMPFILE, "O_TMPFILE" },
			{ O_WRONLY, "O_WRONLY" },
			{ O_RDWR, "O_RDWR" },
			{ O_CREAT, "O_CREAT" },
			{ O_EXCL, "O_EXCL" },
			{ O_NOCTTY, "O_NOCTTY" },
			{ O_TRUNC, "O_TRUNC" },
			{ O_APPEND, "O_APPEND" },
			{ O_NONBLOCK, "O_NONBLOCK" },
			{ O_DSYNC, "O_DSYNC" },
			{ O_DIRECT, "O_DIRECT" },
			{ O_LARGEFILE, "O_LARGEFILE" },
			{ O_DIRECTORY, "O_DIRECTORY" },
			{ O_NOFOLLOW, "O_NOFOLLOW" },
			{ O_NOATIME, "O_NOATIME" },
			{ O_CLOEXEC, "O_CLOEXEC" },
			{ -1, NULL }
		};

	trace_seq_printf(s, "%s(", entry->name);

	/* "done" ends the loop early when the mode argument should be skipped */
	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* O_RDONLY is zero and thus not representable in __flags */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags);
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			/* mode is conventionally shown in octal */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		/* Append the recorded user space string (e.g. the filename) */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
239 
/*
 * Generic pretty printer for syscall-enter events.
 *
 * Prints "name(arg: value, ...)".  For arguments flagged in user_mask the
 * recorded user space data is appended: as a quoted string for string
 * args, or as a hex dump (plus a printable-character rendering) for
 * sized-buffer args.  openat gets its own printer in non-verbose mode.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	/* The event type must match the metadata; a mismatch is a bug */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	switch (entry->syscall_nr) {
	case __NR_openat:
		/* Verbose mode falls through to the generic output below */
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values: small ones in decimal, the rest in hex */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* String data: just print it quoted */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* Array data: the requested size comes from another argument */
		val = trace->args[entry->user_arg_size];

		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		/* Recorded less than requested: mark the dump as truncated */
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		/* Also show the buffer as text, dotting out unprintables */
		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
347 
348 static enum print_line_t
print_syscall_exit(struct trace_iterator * iter,int flags,struct trace_event * event)349 print_syscall_exit(struct trace_iterator *iter, int flags,
350 		   struct trace_event *event)
351 {
352 	struct trace_seq *s = &iter->seq;
353 	struct trace_entry *ent = iter->ent;
354 	struct syscall_trace_exit *trace;
355 	int syscall;
356 	struct syscall_metadata *entry;
357 
358 	trace = (typeof(trace))ent;
359 	syscall = trace->nr;
360 	entry = syscall_nr_to_meta(syscall);
361 
362 	if (!entry) {
363 		trace_seq_putc(s, '\n');
364 		goto out;
365 	}
366 
367 	if (entry->exit_event->event.type != ent->type) {
368 		WARN_ON_ONCE(1);
369 		return TRACE_TYPE_UNHANDLED;
370 	}
371 
372 	trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
373 				trace->ret);
374 
375  out:
376 	return trace_handle_return(s);
377 }
378 
/* Describe one fixed field of a syscall event for the field definition code */
#define SYSCALL_FIELD(_type, _name) {					\
	.type = #_type, .name = #_name,					\
	.size = sizeof(_type), .align = __alignof__(_type),		\
	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }

/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)
386 
/*
 * Build the print_fmt string exported for the sys_enter_openat event.
 *
 * Mirrors what sys_enter_openat_print() produces, expressed in the
 * __print_flags()/__get_str() format that user space tooling parses.
 * Called twice: once with @len == 0 to size the buffer, then to fill it.
 * Returns the length written (or needed).
 *
 * NOTE(review): the flag list here omits O_TMPFILE while the runtime
 * printer's table includes it — presumably because O_TMPFILE overlaps
 * O_DIRECTORY; confirm this is intentional.
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	/* O_RDONLY is zero, so it needs special-casing outside __print_flags() */
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	return pos;
}
439 
/*
 * Build the generic print_fmt string for a syscall-enter event.
 *
 * Produces "arg: 0x%016lx, ..." for the fixed arguments, plus the
 * __get_str()/__print_dynamic_array() accessors for any user space
 * arguments flagged in user_mask.  openat is dispatched to its custom
 * formatter.  Called with @len == 0 first to compute the needed length;
 * returns the length written (or needed).
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	/* First the format string itself ... */
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	/* ... then the argument list that feeds it */
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
492 
set_syscall_print_fmt(struct trace_event_call * call)493 static int __init set_syscall_print_fmt(struct trace_event_call *call)
494 {
495 	char *print_fmt;
496 	int len;
497 	struct syscall_metadata *entry = call->data;
498 
499 	if (entry->enter_event != call) {
500 		call->print_fmt = "\"0x%lx\", REC->ret";
501 		return 0;
502 	}
503 
504 	/* First: called with 0 length to calculate the needed length */
505 	len = __set_enter_print_fmt(entry, NULL, 0);
506 
507 	print_fmt = kmalloc(len + 1, GFP_KERNEL);
508 	if (!print_fmt)
509 		return -ENOMEM;
510 
511 	/* Second: actually write the @print_fmt */
512 	__set_enter_print_fmt(entry, print_fmt, len + 1);
513 	call->print_fmt = print_fmt;
514 
515 	return 0;
516 }
517 
free_syscall_print_fmt(struct trace_event_call * call)518 static void __init free_syscall_print_fmt(struct trace_event_call *call)
519 {
520 	struct syscall_metadata *entry = call->data;
521 
522 	if (entry->enter_event == call)
523 		kfree(call->print_fmt);
524 }
525 
/*
 * Define the trace fields of a syscall-enter event: one unsigned long per
 * fixed argument, followed by a "__data_loc char[]" meta field named
 * "__<arg>_val" for each user space argument flagged in user_mask.
 * Returns 0 on success or a negative errno.
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	/* The fixed arguments, each stored as an unsigned long */
	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			/* Clear the mask so the event degrades gracefully */
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		/* The name string is owned by the field from here on */
		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		offset += 4;	/* each meta field is 4 bytes */
	}
	return ret;
}
582 
/*
 * Create a per CPU temporary buffer to copy user space pointers into.
 *
 * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
 *   (defined in kernel/trace/trace.h)
 *
 * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
 *   nul terminating byte and possibly appended EXTRA (4 bytes).
 *
 * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
 * to copy memory from user space addresses into that will hold
 * 3 args as only 3 args are allowed to be copied from system calls.
 */
#define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
#define SYSCALL_FAULT_MAX_CNT 3
#define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
599 
/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info	buf;	/* the per CPU fault buffers */
	struct rcu_head			rcu;	/* for deferred free via RCU tasks trace */
};

/* Refcounted singleton; NULL when no faultable syscall event is enabled */
static struct syscall_user_buffer *syscall_buffer;
607 
/*
 * Take a reference on the per CPU user-fault buffer, allocating it on
 * first use.  Must be called with syscall_trace_lock held.
 * Returns 0 on success or a negative errno.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	/* Already allocated: just bump the refcount */
	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc(sizeof(*sbuf), GFP_KERNEL);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	/* Publish: readers check syscall_buffer without the lock */
	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}
634 
rcu_free_syscall_buffer(struct rcu_head * rcu)635 static void rcu_free_syscall_buffer(struct rcu_head *rcu)
636 {
637 	struct syscall_user_buffer *sbuf =
638 		container_of(rcu, struct syscall_user_buffer, rcu);
639 
640 	trace_user_fault_destroy(&sbuf->buf);
641 	kfree(sbuf);
642 }
643 
644 
syscall_fault_buffer_disable(void)645 static void syscall_fault_buffer_disable(void)
646 {
647 	struct syscall_user_buffer *sbuf = syscall_buffer;
648 
649 	lockdep_assert_held(&syscall_trace_lock);
650 
651 	if (trace_user_fault_put(&sbuf->buf))
652 		return;
653 
654 	WRITE_ONCE(syscall_buffer, NULL);
655 	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
656 }
657 
/* Per-invocation bookkeeping for reading user space syscall arguments */
struct syscall_args {
	char		*ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user pointers to read */
	int		read[SYSCALL_FAULT_MAX_CNT];		/* bytes read, or < 0 on fault */
	int		uargs;					/* number of pointers in use */
};
663 
/*
 * trace_user_fault_read() callback: copy each user string argument into
 * its own SYSCALL_FAULT_ARG_SZ slot, recording bytes read (or < 0).
 */
static int syscall_copy_user(char *buf, const char __user *ptr,
			     size_t size, void *data)
{
	struct syscall_args *args = data;
	int i;

	for (i = 0; i < args->uargs; i++) {
		ptr = (char __user *)args->ptr_array[i];
		args->read[i] = strncpy_from_user(buf, ptr, size);
		buf += SYSCALL_FAULT_ARG_SZ;
	}

	return 0;
}
677 
/*
 * trace_user_fault_read() callback for sized buffers: copy exactly @size
 * bytes per argument; a fault marks the slot with -1.
 */
static int syscall_copy_user_array(char *buf, const char __user *ptr,
				   size_t size, void *data)
{
	struct syscall_args *args = data;
	int i;

	for (i = 0; i < args->uargs; i++) {
		ptr = (char __user *)args->ptr_array[i];
		if (__copy_from_user(buf, ptr, size))
			args->read[i] = -1;
		else
			args->read[i] = size;
		buf += SYSCALL_FAULT_ARG_SZ;
	}

	return 0;
}
691 
/*
 * Fault in the user space data of a syscall's flagged arguments.
 *
 * Copies each argument in sys_data->user_mask into a per CPU buffer slot
 * (string or sized array depending on user_arg_size), sanitizes strings,
 * applies the @buf_size limit (appending EXTRA on truncation), and fills
 * @data_size[] with the per-slot byte counts (-1 marks unused slots).
 *
 * Returns the per CPU buffer, or NULL if @buf_size is 0 or the copy failed.
 * Runs with preemption enabled (may fault); the caller must hold a
 * reference on @sbuf.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	/* Collect the user pointers of all flagged arguments */
	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	/* Post-process each slot: terminate, sanitize, and apply limits */
	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}
787 
/*
 * Read the user space data for a faultable syscall and compute how much
 * extra event space it needs.
 *
 * @buffer:     out: per CPU buffer holding the copied data
 * @size:       in/out: incremented by the user data plus 4-byte meta fields
 * @user_sizes: out: per-slot data sizes from sys_fault_user()
 * @uargs:      out: number of user arguments actually read
 *
 * Returns 0, or -1 if tracing is shutting down (buffer already gone).
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}
817 
/*
 * Write the faulted user space data into the reserved event record.
 *
 * Lays out, after the fixed arguments: one 4-byte meta field per user
 * argument encoding (size << 16 | offset-from-event-start), then the data
 * itself.  get_dynamic_len_ptr() is the matching decoder on output.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		/* Each subsequent chunk starts after the previous one */
		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	/* Now append the data itself; buf slots are SYSCALL_FAULT_ARG_SZ apart */
	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}
860 
/*
 * sys_enter tracepoint probe: record a syscall-enter event into the ring
 * buffer of trace_array @data, appending faulted-in user space data for
 * syscalls that support it.  Called with preemption enabled (may fault).
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* This syscall's enter event may not be enabled on this instance */
	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	/* Fault in user data first (needs the extra event size it reports) */
	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the previously copied user data after the fixed args */
	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}
926 
/*
 * sys_exit tracepoint probe: record a syscall-exit event (syscall number
 * plus return value) into the ring buffer of trace_array @data.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* This syscall's exit event may not be enabled on this instance */
	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}
968 
/*
 * Enable a syscall-enter event on a trace instance: take the fault buffer
 * if the event reads user memory, register the sys_enter tracepoint on
 * first use, and publish the event file.  Returns 0 or a negative errno.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	/* Faultable events need the per CPU user-copy buffer */
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	/* The tracepoint is registered once per trace_array */
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Undo the buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}
998 
/*
 * Disable a syscall-enter event: unpublish the event file, unregister the
 * tracepoint on last use, and drop the fault buffer reference if taken.
 * Mirrors reg_event_syscall_enter().
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1017 
reg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1018 static int reg_event_syscall_exit(struct trace_event_file *file,
1019 				  struct trace_event_call *call)
1020 {
1021 	struct trace_array *tr = file->tr;
1022 	int ret = 0;
1023 	int num;
1024 
1025 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1026 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1027 		return -ENOSYS;
1028 	mutex_lock(&syscall_trace_lock);
1029 	if (!tr->sys_refcount_exit)
1030 		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
1031 	if (!ret) {
1032 		WRITE_ONCE(tr->exit_syscall_files[num], file);
1033 		tr->sys_refcount_exit++;
1034 	}
1035 	mutex_unlock(&syscall_trace_lock);
1036 	return ret;
1037 }
1038 
unreg_event_syscall_exit(struct trace_event_file * file,struct trace_event_call * call)1039 static void unreg_event_syscall_exit(struct trace_event_file *file,
1040 				     struct trace_event_call *call)
1041 {
1042 	struct trace_array *tr = file->tr;
1043 	int num;
1044 
1045 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1046 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1047 		return;
1048 	mutex_lock(&syscall_trace_lock);
1049 	tr->sys_refcount_exit--;
1050 	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
1051 	if (!tr->sys_refcount_exit)
1052 		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
1053 	mutex_unlock(&syscall_trace_lock);
1054 }
1055 
1056 /*
1057  * For system calls that reference user space memory that can
1058  * be recorded into the event, set the system call meta data's user_mask
1059  * to the "args" index that points to the user space memory to retrieve.
1060  */
/*
 * Classify syscall @nr and record in its metadata which argument slots
 * hold user space pointers that the tracer may copy in.
 *
 * For each recognized syscall, user_mask gets a bit set per argument
 * index that is a user pointer.  When exactly one pointer argument has
 * an explicit length argument, user_arg_size is set to that length
 * argument's index (and user_arg_is_str marks string semantics);
 * user_arg_size stays -1 when no such single length argument applies.
 *
 * Unrecognized syscalls get user_mask = 0 (no user space recording).
 */
static void check_faultable_syscall(struct trace_event_call *call, int nr)
{
	struct syscall_metadata *sys_data = call->data;
	unsigned long mask;

	/* Only work on entry */
	if (sys_data->enter_event != call)
		return;

	/* -1 means "no size argument"; may be overridden below */
	sys_data->user_arg_size = -1;

	/*
	 * The #ifdef guards cover syscalls that do not exist on every
	 * architecture / ABI (e.g. legacy non-"at" variants).
	 */
	switch (nr) {
	/* user arg 1 with size arg at 2 */
	case __NR_write:
#ifdef __NR_mq_timedsend
	case __NR_mq_timedsend:
#endif
	case __NR_pwrite64:
		sys_data->user_mask = BIT(1);
		sys_data->user_arg_size = 2;
		break;
	/* user arg 0 with size arg at 1 as string */
	case __NR_setdomainname:
	case __NR_sethostname:
		sys_data->user_mask = BIT(0);
		sys_data->user_arg_size = 1;
		sys_data->user_arg_is_str = 1;
		break;
#ifdef __NR_kexec_file_load
	/* user arg 4 with size arg at 3 as string */
	case __NR_kexec_file_load:
		sys_data->user_mask = BIT(4);
		sys_data->user_arg_size = 3;
		sys_data->user_arg_is_str = 1;
		break;
#endif
	/* user arg at position 0 */
#ifdef __NR_access
	case __NR_access:
#endif
	case __NR_acct:
	case __NR_chdir:
#ifdef  __NR_chown
	case __NR_chown:
#endif
#ifdef  __NR_chmod
	case __NR_chmod:
#endif
	case __NR_chroot:
#ifdef __NR_creat
	case __NR_creat:
#endif
	case __NR_delete_module:
	case __NR_execve:
	case __NR_fsopen:
#ifdef __NR_lchown
	case __NR_lchown:
#endif
#ifdef __NR_open
	case __NR_open:
#endif
	case __NR_memfd_create:
#ifdef __NR_mkdir
	case __NR_mkdir:
#endif
#ifdef __NR_mknod
	case __NR_mknod:
#endif
	case __NR_mq_open:
	case __NR_mq_unlink:
#ifdef __NR_readlink
	case __NR_readlink:
#endif
#ifdef  __NR_rmdir
	case __NR_rmdir:
#endif
	case __NR_shmdt:
#ifdef __NR_statfs
	case __NR_statfs:
#endif
	case __NR_swapon:
	case __NR_swapoff:
#ifdef __NR_truncate
	case __NR_truncate:
#endif
#ifdef __NR_unlink
	case __NR_unlink:
#endif
	case __NR_umount2:
#ifdef __NR_utime
	case __NR_utime:
#endif
#ifdef __NR_utimes
	case __NR_utimes:
#endif
		sys_data->user_mask = BIT(0);
		break;
	/* user arg at position 1 */
	case __NR_execveat:
	case __NR_faccessat:
	case __NR_faccessat2:
	case __NR_finit_module:
	case __NR_fchmodat:
	case __NR_fchmodat2:
	case __NR_fchownat:
	case __NR_fgetxattr:
	case __NR_flistxattr:
	case __NR_fsetxattr:
	case __NR_fspick:
	case __NR_fremovexattr:
#ifdef __NR_futimesat
	case __NR_futimesat:
#endif
	case __NR_inotify_add_watch:
	case __NR_mkdirat:
	case __NR_mknodat:
	case __NR_mount_setattr:
	case __NR_name_to_handle_at:
#ifdef __NR_newfstatat
	case __NR_newfstatat:
#endif
	case __NR_openat:
	case __NR_openat2:
	case __NR_open_tree:
	case __NR_open_tree_attr:
	case __NR_readlinkat:
	case __NR_quotactl:
	case __NR_syslog:
	case __NR_statx:
	case __NR_unlinkat:
#ifdef __NR_utimensat
	case __NR_utimensat:
#endif
		sys_data->user_mask = BIT(1);
		break;
	/* user arg at position 2 */
	case __NR_init_module:
	case __NR_fsconfig:
		sys_data->user_mask = BIT(2);
		break;
	/* user arg at position 4 */
	case __NR_fanotify_mark:
		sys_data->user_mask = BIT(4);
		break;
	/* 2 user args, 0 and 1 */
	case __NR_add_key:
	case __NR_getxattr:
	case __NR_lgetxattr:
	case __NR_lremovexattr:
#ifdef __NR_link
	case __NR_link:
#endif
	case __NR_listxattr:
	case __NR_llistxattr:
	case __NR_lsetxattr:
	case __NR_pivot_root:
	case __NR_removexattr:
#ifdef __NR_rename
	case __NR_rename:
#endif
	case __NR_request_key:
	case __NR_setxattr:
#ifdef __NR_symlink
	case __NR_symlink:
#endif
		sys_data->user_mask = BIT(0) | BIT(1);
		break;
	/* 2 user args, 0 and 2 */
	case __NR_symlinkat:
		sys_data->user_mask = BIT(0) | BIT(2);
		break;
	/* 2 user args, 1 and 3 */
	case __NR_getxattrat:
	case __NR_linkat:
	case __NR_listxattrat:
	case __NR_move_mount:
#ifdef __NR_renameat
	case __NR_renameat:
#endif
	case __NR_renameat2:
	case __NR_removexattrat:
	case __NR_setxattrat:
		sys_data->user_mask = BIT(1) | BIT(3);
		break;
	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
		break;
	default:
		sys_data->user_mask = 0;
		return;
	}

	if (sys_data->user_arg_size < 0)
		return;

	/*
	 * The user_arg_size can only be used when the system call
	 * is reading only a single address from user space.
	 */
	mask = sys_data->user_mask;
	/* mask & (mask - 1) is nonzero iff more than one bit is set */
	if (WARN_ON(mask & (mask - 1)))
		sys_data->user_arg_size = -1;
}
1264 
init_syscall_trace(struct trace_event_call * call)1265 static int __init init_syscall_trace(struct trace_event_call *call)
1266 {
1267 	int id;
1268 	int num;
1269 
1270 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1271 	if (num < 0 || num >= NR_syscalls) {
1272 		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
1273 				((struct syscall_metadata *)call->data)->name);
1274 		return -ENOSYS;
1275 	}
1276 
1277 	check_faultable_syscall(call, num);
1278 
1279 	if (set_syscall_print_fmt(call) < 0)
1280 		return -ENOMEM;
1281 
1282 	id = trace_event_raw_init(call);
1283 
1284 	if (id < 0) {
1285 		free_syscall_print_fmt(call);
1286 		return id;
1287 	}
1288 
1289 	return id;
1290 }
1291 
/*
 * Field layout for syscall-entry events: the fixed __syscall_nr field
 * plus per-syscall argument fields defined at runtime by
 * syscall_enter_define_fields().
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};
1298 
/* Output callbacks for syscall-entry events (text rendering only) */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};
1302 
/* Output callbacks for syscall-exit events (text rendering only) */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};
1306 
/*
 * Event class shared by all syscall-entry events; registration is
 * dispatched through syscall_enter_register() and fields are resolved
 * per syscall via syscall_get_enter_fields().
 */
struct trace_event_class __refdata event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.fields_array	= syscall_enter_fields_array,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};
1314 
/*
 * Event class shared by all syscall-exit events.  Unlike the entry
 * class, the field layout is fixed (__syscall_nr + ret), so it is
 * given inline as a compound literal.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.fields_array	= (struct trace_event_fields[]){
		SYSCALL_FIELD(int, __syscall_nr),
		SYSCALL_FIELD(long, ret),
		{}
	},
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};
1326 
/*
 * Return the kernel address of the handler for syscall @nr.
 * Weak default reads sys_call_table directly; architectures with a
 * different table layout override this.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}
1331 
init_ftrace_syscalls(void)1332 void __init init_ftrace_syscalls(void)
1333 {
1334 	struct syscall_metadata *meta;
1335 	unsigned long addr;
1336 	int i;
1337 	void *ret;
1338 
1339 	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1340 		syscalls_metadata = kcalloc(NR_syscalls,
1341 					sizeof(*syscalls_metadata),
1342 					GFP_KERNEL);
1343 		if (!syscalls_metadata) {
1344 			WARN_ON(1);
1345 			return;
1346 		}
1347 	}
1348 
1349 	for (i = 0; i < NR_syscalls; i++) {
1350 		addr = arch_syscall_addr(i);
1351 		meta = find_syscall_meta(addr);
1352 		if (!meta)
1353 			continue;
1354 
1355 		meta->syscall_nr = i;
1356 
1357 		if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
1358 			syscalls_metadata[i] = meta;
1359 		} else {
1360 			ret = xa_store(&syscalls_metadata_sparse, i, meta,
1361 					GFP_KERNEL);
1362 			WARN(xa_is_err(ret),
1363 				"Syscall memory allocation failed\n");
1364 		}
1365 
1366 	}
1367 }
1368 
1369 #ifdef CONFIG_PERF_EVENTS
1370 
/* Per-syscall bitmaps of events currently enabled for perf */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/* Number of perf users of the sys_enter / sys_exit tracepoints */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
1375 
/*
 * Invoke the BPF programs attached to a syscall-entry event.
 *
 * Copies the syscall number and arguments from @rec into a stack
 * context laid out like the tracepoint's expected format, then runs
 * the program array.  Returns the trace_call_bpf() verdict (nonzero
 * means the event should be recorded).
 */
static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
			       struct syscall_metadata *sys_data,
			       struct syscall_trace_enter *rec)
{
	struct syscall_tp_t {
		struct trace_entry ent;
		int syscall_nr;
		unsigned long args[SYSCALL_DEFINE_MAXARGS];
	} __aligned(8) param;
	int i;

	/* The regs pointer is smuggled in the ent header; it must fit */
	BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));

	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
	perf_fetch_caller_regs(regs);
	*(struct pt_regs **)&param = regs;
	param.syscall_nr = rec->nr;
	/* Only copy the arguments this syscall actually takes */
	for (i = 0; i < sys_data->nb_args; i++)
		param.args[i] = rec->args[i];
	return trace_call_bpf(call, &param);
}
1397 
/*
 * sys_enter tracepoint probe for perf: records the syscall number and
 * arguments (plus copied-in user space data for faultable syscalls)
 * into a perf trace buffer, runs attached BPF programs, and submits
 * the record to interested perf events.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	unsigned long args[6];
	bool valid_prog_array;
	bool mayfault;
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
	int syscall_nr;
	int rctx;
	int size = 0;
	int uargs = 0;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	/* Bail early if no perf event enabled this syscall */
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	if (mayfault) {
		/* Copy in the user data; on failure drop the event */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	/* Nothing to do without BPF programs or listening perf events */
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the copied-in user data behind the fixed arguments */
	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/* BPF filter may veto the record; if so, release the recursion slot */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}
1474 
perf_sysenter_enable(struct trace_event_call * call)1475 static int perf_sysenter_enable(struct trace_event_call *call)
1476 {
1477 	struct syscall_metadata *sys_data = call->data;
1478 	int num;
1479 	int ret;
1480 
1481 	num = sys_data->syscall_nr;
1482 
1483 	guard(mutex)(&syscall_trace_lock);
1484 	if (sys_data->user_mask) {
1485 		ret = syscall_fault_buffer_enable();
1486 		if (ret < 0)
1487 			return ret;
1488 	}
1489 	if (!sys_perf_refcount_enter) {
1490 		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
1491 		if (ret) {
1492 			pr_info("event trace: Could not activate syscall entry trace point");
1493 			if (sys_data->user_mask)
1494 				syscall_fault_buffer_disable();
1495 			return ret;
1496 		}
1497 	}
1498 	set_bit(num, enabled_perf_enter_syscalls);
1499 	sys_perf_refcount_enter++;
1500 	return 0;
1501 }
1502 
/*
 * Disable perf tracing of syscall entry for the syscall described by
 * @call, undoing perf_sysenter_enable().  The sys_enter tracepoint
 * probe is removed when the last perf user goes away.
 */
static void perf_sysenter_disable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	sys_perf_refcount_enter--;
	clear_bit(num, enabled_perf_enter_syscalls);
	if (!sys_perf_refcount_enter)
		unregister_trace_sys_enter(perf_syscall_enter, NULL);
	/* Release the fault buffer reference taken at enable time */
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1518 
/*
 * Invoke the BPF programs attached to a syscall-exit event.
 *
 * Copies the syscall number and return value from @rec into a stack
 * context laid out like the tracepoint's expected format, then runs
 * the program array.  Returns the trace_call_bpf() verdict (nonzero
 * means the event should be recorded).
 */
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
			      struct syscall_trace_exit *rec)
{
	struct syscall_tp_t {
		struct trace_entry ent;
		int syscall_nr;
		unsigned long ret;
	} __aligned(8) param;

	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
	perf_fetch_caller_regs(regs);
	*(struct pt_regs **)&param = regs;
	param.syscall_nr = rec->nr;
	param.ret = rec->ret;
	return trace_call_bpf(call, &param);
}
1535 
/*
 * sys_exit tracepoint probe for perf: records the syscall number and
 * return value into a perf trace buffer, runs attached BPF programs,
 * and submits the record to interested perf events.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	bool valid_prog_array;
	int syscall_nr;
	int rctx;
	int size;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	/* Bail early if no perf event enabled this syscall */
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
	/* Nothing to do without BPF programs or listening perf events */
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	/* BPF filter may veto the record; if so, release the recursion slot */
	if ((valid_prog_array &&
	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
			      1, regs, head, NULL);
}
1590 
perf_sysexit_enable(struct trace_event_call * call)1591 static int perf_sysexit_enable(struct trace_event_call *call)
1592 {
1593 	int num;
1594 
1595 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1596 
1597 	guard(mutex)(&syscall_trace_lock);
1598 	if (!sys_perf_refcount_exit) {
1599 		int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
1600 		if (ret) {
1601 			pr_info("event trace: Could not activate syscall exit trace point");
1602 			return ret;
1603 		}
1604 	}
1605 	set_bit(num, enabled_perf_exit_syscalls);
1606 	sys_perf_refcount_exit++;
1607 	return 0;
1608 }
1609 
perf_sysexit_disable(struct trace_event_call * call)1610 static void perf_sysexit_disable(struct trace_event_call *call)
1611 {
1612 	int num;
1613 
1614 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1615 
1616 	guard(mutex)(&syscall_trace_lock);
1617 	sys_perf_refcount_exit--;
1618 	clear_bit(num, enabled_perf_exit_syscalls);
1619 	if (!sys_perf_refcount_exit)
1620 		unregister_trace_sys_exit(perf_syscall_exit, NULL);
1621 }
1622 
1623 #endif /* CONFIG_PERF_EVENTS */
1624 
/*
 * Registration dispatcher for syscall-entry events (the event class's
 * .reg callback).  Routes ftrace register/unregister and perf
 * enable/disable requests to the matching helper; perf open/close and
 * add/del need no per-event work here.
 */
static int syscall_enter_register(struct trace_event_call *event,
				 enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_enter(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_enter(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysenter_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysenter_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}
1652 
/*
 * Registration dispatcher for syscall-exit events (the event class's
 * .reg callback).  Routes ftrace register/unregister and perf
 * enable/disable requests to the matching helper; perf open/close and
 * add/del need no per-event work here.
 */
static int syscall_exit_register(struct trace_event_call *event,
				 enum trace_reg type, void *data)
{
	struct trace_event_file *file = data;

	switch (type) {
	case TRACE_REG_REGISTER:
		return reg_event_syscall_exit(file, event);
	case TRACE_REG_UNREGISTER:
		unreg_event_syscall_exit(file, event);
		return 0;

#ifdef CONFIG_PERF_EVENTS
	case TRACE_REG_PERF_REGISTER:
		return perf_sysexit_enable(event);
	case TRACE_REG_PERF_UNREGISTER:
		perf_sysexit_disable(event);
		return 0;
	case TRACE_REG_PERF_OPEN:
	case TRACE_REG_PERF_CLOSE:
	case TRACE_REG_PERF_ADD:
	case TRACE_REG_PERF_DEL:
		return 0;
#endif
	}
	return 0;
}
1680