xref: /linux/kernel/trace/trace_syscalls.c (revision 754e38d2d1aeeadddac5220f34e07cf263502a46)
1 // SPDX-License-Identifier: GPL-2.0
2 #include <trace/syscall.h>
3 #include <trace/events/syscalls.h>
4 #include <linux/kernel_stat.h>
5 #include <linux/syscalls.h>
6 #include <linux/slab.h>
7 #include <linux/kernel.h>
8 #include <linux/module.h>	/* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
9 #include <linux/ftrace.h>
10 #include <linux/perf_event.h>
11 #include <linux/xarray.h>
12 #include <asm/syscall.h>
13 
14 #include "trace_output.h"
15 #include "trace.h"
16 
17 static DEFINE_MUTEX(syscall_trace_lock);
18 
19 static int syscall_enter_register(struct trace_event_call *event,
20 				 enum trace_reg type, void *data);
21 static int syscall_exit_register(struct trace_event_call *event,
22 				 enum trace_reg type, void *data);
23 
24 static struct list_head *
25 syscall_get_enter_fields(struct trace_event_call *call)
26 {
27 	struct syscall_metadata *entry = call->data;
28 
29 	return &entry->enter_fields;
30 }
31 
32 extern struct syscall_metadata *__start_syscalls_metadata[];
33 extern struct syscall_metadata *__stop_syscalls_metadata[];
34 
35 static DEFINE_XARRAY(syscalls_metadata_sparse);
36 static struct syscall_metadata **syscalls_metadata;
37 
#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
/*
 * Compare a looked-up symbol against a syscall name, skipping the
 * three character prefix on both sides.  Archs that use syscall
 * wrappers may alias syscalls with ".SyS" or ".sys" instead of "sys",
 * so comparing from the fourth character avoids spurious mismatches.
 */
static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
{
	const char *sym_body = sym + 3;
	const char *name_body = name + 3;

	return strcmp(sym_body, name_body) == 0;
}
#endif
50 
#ifdef ARCH_TRACE_IGNORE_COMPAT_SYSCALLS
/*
 * Some architectures that allow for 32bit applications
 * to run on a 64bit kernel, do not map the syscalls for
 * the 32bit tasks the same as they do for 64bit tasks.
 *
 *     *cough*x86*cough*
 *
 * In such a case, instead of reporting the wrong syscalls,
 * simply ignore them.
 *
 * For an arch to ignore the compat syscalls it needs to
 * define ARCH_TRACE_IGNORE_COMPAT_SYSCALLS as well as
 * define the function arch_trace_is_compat_syscall() to let
 * the tracing system know that it should ignore it.
 */
static int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	/* Returning -1 tells the callers to skip this syscall entirely */
	if (unlikely(arch_trace_is_compat_syscall(regs)))
		return -1;

	return syscall_get_nr(task, regs);
}
#else
/* No compat quirks: just report the arch's syscall number for @task */
static inline int
trace_get_syscall_nr(struct task_struct *task, struct pt_regs *regs)
{
	return syscall_get_nr(task, regs);
}
#endif /* ARCH_TRACE_IGNORE_COMPAT_SYSCALLS */
82 
83 static __init struct syscall_metadata *
84 find_syscall_meta(unsigned long syscall)
85 {
86 	struct syscall_metadata **start;
87 	struct syscall_metadata **stop;
88 	char str[KSYM_SYMBOL_LEN];
89 
90 
91 	start = __start_syscalls_metadata;
92 	stop = __stop_syscalls_metadata;
93 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
94 
95 	if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
96 		return NULL;
97 
98 	for ( ; start < stop; start++) {
99 		if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
100 			return *start;
101 	}
102 	return NULL;
103 }
104 
105 static struct syscall_metadata *syscall_nr_to_meta(int nr)
106 {
107 	if (IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR))
108 		return xa_load(&syscalls_metadata_sparse, (unsigned long)nr);
109 
110 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
111 		return NULL;
112 
113 	return syscalls_metadata[nr];
114 }
115 
116 const char *get_syscall_name(int syscall)
117 {
118 	struct syscall_metadata *entry;
119 
120 	entry = syscall_nr_to_meta(syscall);
121 	if (!entry)
122 		return NULL;
123 
124 	return entry->name;
125 }
126 
127 /* Added to user strings or arrays when max limit is reached */
128 #define EXTRA "..."
129 
/*
 * Locate one chunk of recorded user space data within a syscall enter
 * event.
 *
 * Each user space argument copied into the event has a 4 byte meta
 * field stored right after the fixed (long) syscall arguments.  The
 * meta field encodes (len << 16 | offset), where offset is measured
 * from the start of the event and len is the amount of data recorded.
 *
 * @trace:    the syscall enter event
 * @entry:    syscall metadata (supplies the number of fixed args)
 * @offset_p: in/out - byte offset of the meta field to read relative to
 *            the end of the fixed args; advanced past it (by 4) on return
 * @len_p:    out - length of the recorded user space data
 * @ptr_p:    out - pointer to the recorded data within the event
 */
static void get_dynamic_len_ptr(struct syscall_trace_enter *trace,
				struct syscall_metadata *entry,
				int *offset_p, int *len_p, unsigned char **ptr_p)
{
	unsigned char *ptr;
	int offset = *offset_p;
	int val;

	/* The meta fields live just past the fixed syscall arguments */
	ptr = (void *)trace->args + sizeof(long) * entry->nb_args + offset;
	val = *(int *)ptr;

	/* The value is a dynamic string (len << 16 | offset) */
	ptr = (void *)trace + (val & 0xffff);
	*len_p = val >> 16;
	offset += 4;

	*ptr_p = ptr;
	*offset_p = offset;
}
150 
/*
 * Pretty-print a recorded openat(2) enter event.
 *
 * Decodes the flags argument into symbolic O_* names, prints the mode
 * argument in octal only when a file may be created (O_CREAT/O_TMPFILE
 * set), and appends the user space filename string that was recorded
 * with the event.
 */
static enum print_line_t
sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadata *entry,
		       struct trace_seq *s, struct trace_event *event)
{
	unsigned char *ptr;
	int offset = 0;
	int bits, len;
	bool done = false;
	/*
	 * NOTE(review): O_TMPFILE is listed first, presumably so this
	 * compound value is matched before its component bits - confirm
	 * against trace_print_flags_seq() semantics.
	 */
	static const struct trace_print_flags __flags[] =
		{
			{ O_TMPFILE, "O_TMPFILE" },
			{ O_WRONLY, "O_WRONLY" },
			{ O_RDWR, "O_RDWR" },
			{ O_CREAT, "O_CREAT" },
			{ O_EXCL, "O_EXCL" },
			{ O_NOCTTY, "O_NOCTTY" },
			{ O_TRUNC, "O_TRUNC" },
			{ O_APPEND, "O_APPEND" },
			{ O_NONBLOCK, "O_NONBLOCK" },
			{ O_DSYNC, "O_DSYNC" },
			{ O_DIRECT, "O_DIRECT" },
			{ O_LARGEFILE, "O_LARGEFILE" },
			{ O_DIRECTORY, "O_DIRECTORY" },
			{ O_NOFOLLOW, "O_NOFOLLOW" },
			{ O_NOATIME, "O_NOATIME" },
			{ O_CLOEXEC, "O_CLOEXEC" },
		};

	trace_seq_printf(s, "%s(", entry->name);

	/* done is set once the remaining args (mode) should be skipped */
	for (int i = 0; !done && i < entry->nb_args; i++) {

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		switch (i) {
		case 2:
			bits = trace->args[2];

			trace_seq_puts(s, "flags: ");

			/* No need to show mode when not creating the file */
			if (!(bits & (O_CREAT|O_TMPFILE)))
				done = true;

			/* O_RDONLY is zero, so it never matches a table entry */
			if (!(bits & O_ACCMODE)) {
				if (!bits) {
					trace_seq_puts(s, "O_RDONLY");
					continue;
				}
				trace_seq_puts(s, "O_RDONLY|");
			}

			trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags));
			/*
			 * trace_print_flags_seq() adds a '\0' to the
			 * buffer, but this needs to append more to the seq.
			 */
			if (!trace_seq_has_overflowed(s))
				trace_seq_pop(s);

			continue;
		case 3:
			/* mode: shown in octal, e.g. 0644 */
			trace_seq_printf(s, "%s: 0%03o", entry->args[i],
					 (unsigned int)trace->args[i]);
			continue;
		}

		trace_seq_printf(s, "%s: %lu", entry->args[i],
				 trace->args[i]);

		/* Append the recorded user space data (the filename) */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);
		trace_seq_printf(s, " \"%.*s\"", len, ptr);
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
238 
/*
 * Output a syscall enter event to the trace sequence.
 *
 * Numeric arguments are printed in decimal when small (< 10) and in hex
 * otherwise.  Any user space data recorded with the event is appended:
 * as a quoted string for string-like args, or as a hex dump (plus a
 * best-effort printable rendering) for sized-buffer args.  openat gets
 * a dedicated pretty printer unless the "verbose" trace option is set.
 */
static enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_array *tr = iter->tr;
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, syscall, val, len;
	unsigned char *ptr;
	int offset = 0;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	if (!entry)
		goto end;

	/* The event type must match the metadata's enter event */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	switch (entry->syscall_nr) {
	case __NR_openat:
		if (!tr || !(tr->trace_flags & TRACE_ITER(VERBOSE)))
			return sys_enter_openat_print(trace, entry, s, event);
		break;
	default:
		break;
	}

	trace_seq_printf(s, "%s(", entry->name);

	for (i = 0; i < entry->nb_args; i++) {
		bool printable = false;
		char *str;

		if (trace_seq_has_overflowed(s))
			goto end;

		if (i)
			trace_seq_puts(s, ", ");

		/* parameter types */
		if (tr && tr->trace_flags & TRACE_ITER(VERBOSE))
			trace_seq_printf(s, "%s ", entry->types[i]);

		/* parameter values */
		if (trace->args[i] < 10)
			trace_seq_printf(s, "%s: %lu", entry->args[i],
					 trace->args[i]);
		else
			trace_seq_printf(s, "%s: 0x%lx", entry->args[i],
					 trace->args[i]);

		/* Only args in user_mask have recorded user space data */
		if (!(BIT(i) & entry->user_mask))
			continue;

		get_dynamic_len_ptr(trace, entry, &offset, &len, &ptr);

		/* Strings are shown directly in quotes */
		if (entry->user_arg_size < 0 || entry->user_arg_is_str) {
			trace_seq_printf(s, " \"%.*s\"", len, ptr);
			continue;
		}

		/* val is the size the syscall was asked to use */
		val = trace->args[entry->user_arg_size];

		/* Sized buffers: hex dump first, e.g. (de:ad:be:ef) */
		str = ptr;
		trace_seq_puts(s, " (");
		for (int x = 0; x < len; x++, ptr++) {
			if (isascii(*ptr) && isprint(*ptr))
				printable = true;
			if (x)
				trace_seq_putc(s, ':');
			trace_seq_printf(s, "%02x", *ptr);
		}
		/* Recorded less than requested: mark as truncated */
		if (len < val)
			trace_seq_printf(s, ", %s", EXTRA);

		trace_seq_putc(s, ')');

		/* If nothing is printable, don't bother printing anything */
		if (!printable)
			continue;

		/* Then a printable rendering, non-printables shown as '.' */
		trace_seq_puts(s, " \"");
		for (int x = 0; x < len; x++) {
			if (isascii(str[x]) && isprint(str[x]))
				trace_seq_putc(s, str[x]);
			else
				trace_seq_putc(s, '.');
		}
		if (len < val)
			trace_seq_printf(s, "\"%s", EXTRA);
		else
			trace_seq_putc(s, '"');
	}

	trace_seq_putc(s, ')');
end:
	trace_seq_putc(s, '\n');

	return trace_handle_return(s);
}
346 
347 static enum print_line_t
348 print_syscall_exit(struct trace_iterator *iter, int flags,
349 		   struct trace_event *event)
350 {
351 	struct trace_seq *s = &iter->seq;
352 	struct trace_entry *ent = iter->ent;
353 	struct syscall_trace_exit *trace;
354 	int syscall;
355 	struct syscall_metadata *entry;
356 
357 	trace = (typeof(trace))ent;
358 	syscall = trace->nr;
359 	entry = syscall_nr_to_meta(syscall);
360 
361 	if (!entry) {
362 		trace_seq_putc(s, '\n');
363 		goto out;
364 	}
365 
366 	if (entry->exit_event->event.type != ent->type) {
367 		WARN_ON_ONCE(1);
368 		return TRACE_TYPE_UNHANDLED;
369 	}
370 
371 	trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
372 				trace->ret);
373 
374  out:
375 	return trace_handle_return(s);
376 }
377 
378 #define SYSCALL_FIELD(_type, _name) {					\
379 	.type = #_type, .name = #_name,					\
380 	.size = sizeof(_type), .align = __alignof__(_type),		\
381 	.is_signed = is_signed_type(_type), .filter_type = FILTER_OTHER }
382 
383 /* When len=0, we just calculate the needed length */
384 #define LEN_OR_ZERO (len ? len - pos : 0)
385 
/*
 * Build the print_fmt string exposed in the openat enter event's
 * "format" file, mirroring what sys_enter_openat_print() produces at
 * output time: flags decoded via __print_flags(), mode in octal, and
 * the recorded filename via __get_str().
 *
 * When @len is 0 nothing is written and only the needed length is
 * returned (see LEN_OR_ZERO).
 */
static int __init
sys_enter_openat_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int pos = 0;

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"\"dfd: 0x%%08lx, filename: 0x%%08lx \\\"%%s\\\", flags: %%s%%s, mode: 0%%03o\",");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->dfd)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->filename)),");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" __get_str(__filename_val),");
	/* Emit "O_RDONLY|" when flags are set but the access mode bits are 0 */
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" (REC->flags & ~3) && !(REC->flags & 3) ? \"O_RDONLY|\" : \"\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" REC->flags ? __print_flags(REC->flags, \"|\", ");
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_WRONLY\" }, ", O_WRONLY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_RDWR\" }, ", O_RDWR);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CREAT\" }, ", O_CREAT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_EXCL\" }, ", O_EXCL);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOCTTY\" }, ", O_NOCTTY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_TRUNC\" }, ", O_TRUNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_APPEND\" }, ", O_APPEND);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NONBLOCK\" }, ", O_NONBLOCK);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DSYNC\" }, ", O_DSYNC);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECT\" }, ", O_DIRECT);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_LARGEFILE\" }, ", O_LARGEFILE);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_DIRECTORY\" }, ", O_DIRECTORY);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOFOLLOW\" }, ", O_NOFOLLOW);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_NOATIME\" }, ", O_NOATIME);
	pos += snprintf(buf + pos, LEN_OR_ZERO,
			"{ 0x%x, \"O_CLOEXEC\" }) : \"O_RDONLY\", ", O_CLOEXEC);

	pos += snprintf(buf + pos, LEN_OR_ZERO,
			" ((unsigned long)(REC->mode))");
	/* return the length of print_fmt */
	return pos;
}
438 
/*
 * Build (or size, when @len is 0) the print_fmt string for a syscall
 * enter event.  Every argument is shown in hex; each argument that has
 * user space data recorded gets a companion "__<arg>_val" dynamic field
 * appended to the format arguments, printed as a string or as a byte
 * array depending on the syscall's metadata.
 *
 * Returns the length of the generated print_fmt (excluding the nul).
 */
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	bool is_string = entry->user_arg_is_str;
	int i;
	int pos = 0;

	/* openat has a dedicated, hand-crafted format */
	switch (entry->syscall_nr) {
	case __NR_openat:
		return sys_enter_openat_print_fmt(entry, buf, len);
	default:
		break;
	}

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		if (i)
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", ");
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx",
				entry->args[i], sizeof(unsigned long));

		if (!(BIT(i) & entry->user_mask))
			continue;

		/* Add the format for the user space string or array */
		if (entry->user_arg_size < 0 || is_string)
			pos += snprintf(buf + pos, LEN_OR_ZERO, " \\\"%%s\\\"");
		else
			pos += snprintf(buf + pos, LEN_OR_ZERO, " (%%s)");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
		if (!(BIT(i) & entry->user_mask))
			continue;
		/* The user space data for arg has name __<arg>_val */
		if (entry->user_arg_size < 0 || is_string) {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __get_str(__%s_val)",
					entry->args[i]);
		} else {
			pos += snprintf(buf + pos, LEN_OR_ZERO, ", __print_dynamic_array(__%s_val, 1)",
					entry->args[i]);
		}
	}

/* LEN_OR_ZERO is not used past this point */
#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
491 
492 static int __init set_syscall_print_fmt(struct trace_event_call *call)
493 {
494 	char *print_fmt;
495 	int len;
496 	struct syscall_metadata *entry = call->data;
497 
498 	if (entry->enter_event != call) {
499 		call->print_fmt = "\"0x%lx\", REC->ret";
500 		return 0;
501 	}
502 
503 	/* First: called with 0 length to calculate the needed length */
504 	len = __set_enter_print_fmt(entry, NULL, 0);
505 
506 	print_fmt = kmalloc(len + 1, GFP_KERNEL);
507 	if (!print_fmt)
508 		return -ENOMEM;
509 
510 	/* Second: actually write the @print_fmt */
511 	__set_enter_print_fmt(entry, print_fmt, len + 1);
512 	call->print_fmt = print_fmt;
513 
514 	return 0;
515 }
516 
517 static void __init free_syscall_print_fmt(struct trace_event_call *call)
518 {
519 	struct syscall_metadata *entry = call->data;
520 
521 	if (entry->enter_event == call)
522 		kfree(call->print_fmt);
523 }
524 
/*
 * Register the fields of a syscall enter event: one unsigned long per
 * syscall argument, plus one "__<arg>_val" dynamic field for every
 * argument that has user space data recorded (per meta->user_mask).
 * Returns 0 or a negative error; on allocation failure the user_mask
 * is cleared so the event falls back to plain argument recording.
 */
static int __init syscall_enter_define_fields(struct trace_event_call *call)
{
	struct syscall_trace_enter trace;
	struct syscall_metadata *meta = call->data;
	unsigned long mask;
	char *arg;
	int offset = offsetof(typeof(trace), args);
	int ret = 0;
	int len;
	int i;

	/* The fixed arguments: one unsigned long each */
	for (i = 0; i < meta->nb_args; i++) {
		ret = trace_define_field(call, meta->types[i],
					 meta->args[i], offset,
					 sizeof(unsigned long), 0,
					 FILTER_OTHER);
		if (ret)
			break;
		offset += sizeof(unsigned long);
	}

	if (ret || !meta->user_mask)
		return ret;

	mask = meta->user_mask;

	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		/*
		 * User space data is faulted into a temporary buffer and then
		 * added as a dynamic string or array to the end of the event.
		 * The user space data name for the arg pointer is
		 * "__<arg>_val".
		 */
		len = strlen(meta->args[idx]) + sizeof("___val");
		arg = kmalloc(len, GFP_KERNEL);
		if (WARN_ON_ONCE(!arg)) {
			meta->user_mask = 0;
			return -ENOMEM;
		}

		snprintf(arg, len, "__%s_val", meta->args[idx]);

		/*
		 * NOTE(review): arg is only freed on failure below;
		 * presumably trace_define_field() keeps the name pointer
		 * for the lifetime of the event - confirm.
		 */
		ret = trace_define_field(call, "__data_loc char[]",
					 arg, offset, sizeof(int), 0,
					 FILTER_OTHER);
		if (ret) {
			kfree(arg);
			break;
		}
		/* The dynamic field's meta data is a 4 byte (len<<16|offset) */
		offset += 4;
	}
	return ret;
}
581 
582 /*
583  * Create a per CPU temporary buffer to copy user space pointers into.
584  *
585  * SYSCALL_FAULT_USER_MAX is the amount to copy from user space.
586  *  (defined in kernel/trace/trace.h)
587 
588  * SYSCALL_FAULT_ARG_SZ is the amount to copy from user space plus the
589  *   nul terminating byte and possibly appended EXTRA (4 bytes).
590  *
591  * SYSCALL_FAULT_BUF_SZ holds the size of the per CPU buffer to use
592  * to copy memory from user space addresses into that will hold
593  * 3 args as only 3 args are allowed to be copied from system calls.
594  */
595 #define SYSCALL_FAULT_ARG_SZ (SYSCALL_FAULT_USER_MAX + 1 + 4)
596 #define SYSCALL_FAULT_MAX_CNT 3
597 #define SYSCALL_FAULT_BUF_SZ (SYSCALL_FAULT_ARG_SZ * SYSCALL_FAULT_MAX_CNT)
598 
/* Use the tracing per CPU buffer infrastructure to copy from user space */
struct syscall_user_buffer {
	struct trace_user_buf_info	buf;	/* per CPU fault buffers, refcounted via trace_user_fault_get()/put() */
	struct rcu_head			rcu;	/* for deferred free (RCU tasks trace) */
};

/* Shared buffer instance; protected by syscall_trace_lock for setup/teardown */
static struct syscall_user_buffer *syscall_buffer;
606 
/*
 * Allocate the shared user-space fault buffer, or take another
 * reference on it when it already exists.  Must be called with
 * syscall_trace_lock held (paired with syscall_fault_buffer_disable()).
 * Returns 0 on success or a negative error.
 */
static int syscall_fault_buffer_enable(void)
{
	struct syscall_user_buffer *sbuf;
	int ret;

	lockdep_assert_held(&syscall_trace_lock);

	/* Already set up: just add a reference */
	if (syscall_buffer) {
		trace_user_fault_get(&syscall_buffer->buf);
		return 0;
	}

	sbuf = kmalloc_obj(*sbuf);
	if (!sbuf)
		return -ENOMEM;

	ret = trace_user_fault_init(&sbuf->buf, SYSCALL_FAULT_BUF_SZ);
	if (ret < 0) {
		kfree(sbuf);
		return ret;
	}

	/* Publish for the lockless readers in the syscall probes */
	WRITE_ONCE(syscall_buffer, sbuf);

	return 0;
}
633 
634 static void rcu_free_syscall_buffer(struct rcu_head *rcu)
635 {
636 	struct syscall_user_buffer *sbuf =
637 		container_of(rcu, struct syscall_user_buffer, rcu);
638 
639 	trace_user_fault_destroy(&sbuf->buf);
640 	kfree(sbuf);
641 }
642 
643 
644 static void syscall_fault_buffer_disable(void)
645 {
646 	struct syscall_user_buffer *sbuf = syscall_buffer;
647 
648 	lockdep_assert_held(&syscall_trace_lock);
649 
650 	if (trace_user_fault_put(&sbuf->buf))
651 		return;
652 
653 	WRITE_ONCE(syscall_buffer, NULL);
654 	call_rcu_tasks_trace(&sbuf->rcu, rcu_free_syscall_buffer);
655 }
656 
/* Bookkeeping for copying up to SYSCALL_FAULT_MAX_CNT user space args */
struct syscall_args {
	char		*ptr_array[SYSCALL_FAULT_MAX_CNT];	/* user pointers to read from */
	int		read[SYSCALL_FAULT_MAX_CNT];		/* bytes read per arg, negative on fault */
	int		uargs;					/* number of valid entries above */
};
662 
663 static int syscall_copy_user(char *buf, const char __user *ptr,
664 			     size_t size, void *data)
665 {
666 	struct syscall_args *args = data;
667 	int ret;
668 
669 	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
670 		ptr = (char __user *)args->ptr_array[i];
671 		ret = strncpy_from_user(buf, ptr, size);
672 		args->read[i] = ret;
673 	}
674 	return 0;
675 }
676 
677 static int syscall_copy_user_array(char *buf, const char __user *ptr,
678 				   size_t size, void *data)
679 {
680 	struct syscall_args *args = data;
681 	int ret;
682 
683 	for (int i = 0; i < args->uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
684 		ptr = (char __user *)args->ptr_array[i];
685 		ret = __copy_from_user(buf, ptr, size);
686 		args->read[i] = ret ? -1 : size;
687 	}
688 	return 0;
689 }
690 
/*
 * Copy the user space data referenced by a syscall's arguments into the
 * per CPU fault buffer.
 *
 * @buf_size:  max amount of each arg to record (0 means record nothing)
 * @sys_data:  syscall metadata; user_mask selects which args to read
 * @sbuf:      the per CPU buffer to copy into
 * @args:      the syscall's argument values
 * @data_size: out - bytes recorded per arg; unused slots are set to
 *             (unsigned)-1.
 *             NOTE(review): declared unsigned int here but the caller
 *             passes an int array and tests "< 0" - confirm the
 *             intended signedness.
 *
 * Returns the filled buffer (one SYSCALL_FAULT_ARG_SZ slot per arg) or
 * NULL when nothing was read.
 */
static char *sys_fault_user(unsigned int buf_size,
			    struct syscall_metadata *sys_data,
			    struct syscall_user_buffer *sbuf,
			    unsigned long *args,
			    unsigned int data_size[SYSCALL_FAULT_MAX_CNT])
{
	trace_user_buf_copy syscall_copy = syscall_copy_user;
	unsigned long mask = sys_data->user_mask;
	unsigned long size = SYSCALL_FAULT_ARG_SZ - 1;
	struct syscall_args sargs;
	bool array = false;
	char *buffer;
	char *buf;
	int ret;
	int i = 0;

	/* The extra is appended to the user data in the buffer */
	BUILD_BUG_ON(SYSCALL_FAULT_USER_MAX + sizeof(EXTRA) >=
		     SYSCALL_FAULT_ARG_SZ);

	/*
	 * If this system call event has a size argument, use
	 * it to define how much of user space memory to read,
	 * and read it as an array and not a string.
	 */
	if (sys_data->user_arg_size >= 0) {
		array = true;
		size = args[sys_data->user_arg_size];
		if (size > SYSCALL_FAULT_ARG_SZ - 1)
			size = SYSCALL_FAULT_ARG_SZ - 1;
		syscall_copy = syscall_copy_user_array;
	}

	/* Collect the user space pointers selected by user_mask */
	while (mask) {
		int idx = ffs(mask) - 1;
		mask &= ~BIT(idx);

		if (WARN_ON_ONCE(i == SYSCALL_FAULT_MAX_CNT))
			break;

		/* Get the pointer to user space memory to read */
		sargs.ptr_array[i++] = (char *)args[idx];
	}

	sargs.uargs = i;

	/* Clear the values that are not used */
	for (; i < SYSCALL_FAULT_MAX_CNT; i++) {
		data_size[i] = -1; /* Denotes no pointer */
	}

	/* A zero size means do not even try */
	if (!buf_size)
		return NULL;

	buffer = trace_user_fault_read(&sbuf->buf, NULL, size,
				       syscall_copy, &sargs);
	if (!buffer)
		return NULL;

	/* Post-process each slot: terminate, sanitize, and size-clamp */
	buf = buffer;
	for (i = 0; i < sargs.uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {

		ret = sargs.read[i];
		if (ret < 0)
			continue;
		buf[ret] = '\0';

		/* For strings, replace any non-printable characters with '.' */
		if (!array) {
			for (int x = 0; x < ret; x++) {
				if (!isprint(buf[x]))
					buf[x] = '.';
			}

			size = min(buf_size, SYSCALL_FAULT_USER_MAX);

			/*
			 * If the text was truncated due to our max limit,
			 * add "..." to the string.
			 */
			if (ret > size) {
				strscpy(buf + size, EXTRA, sizeof(EXTRA));
				ret = size + sizeof(EXTRA);
			} else {
				buf[ret++] = '\0';
			}
		} else {
			ret = min((unsigned int)ret, buf_size);
		}
		data_size[i] = ret;
	}

	return buffer;
}
786 
/*
 * Read the user space data for @sys_data's faultable args into the per
 * CPU buffer.  On success, @buffer points at the copied data, @size is
 * incremented by the event space needed (the data plus a 4 byte meta
 * field per arg), @user_sizes holds the per-arg sizes and @uargs the
 * number of user-read args.  Returns 0, or -1 when tracing is being
 * shut down.
 */
static int
syscall_get_data(struct syscall_metadata *sys_data, unsigned long *args,
		 char **buffer, int *size, int *user_sizes, int *uargs,
		 int buf_size)
{
	struct syscall_user_buffer *sbuf;
	int i;

	/* If the syscall_buffer is NULL, tracing is being shutdown */
	sbuf = READ_ONCE(syscall_buffer);
	if (!sbuf)
		return -1;

	*buffer = sys_fault_user(buf_size, sys_data, sbuf, args, user_sizes);
	/*
	 * user_size is the amount of data to append.
	 * Need to add 4 for the meta field that points to
	 * the user memory at the end of the event and also
	 * stores its size.
	 */
	for (i = 0; i < SYSCALL_FAULT_MAX_CNT; i++) {
		/* A negative size marks the first unused slot */
		if (user_sizes[i] < 0)
			break;
		*size += user_sizes[i] + 4;
	}
	/* Save the number of user read arguments of this syscall */
	*uargs = i;
	return 0;
}
816 
/*
 * Append the copied user space data to a syscall enter event.  For each
 * recorded arg a 4 byte meta field (size << 16 | offset-from-event-start)
 * is written right after the fixed args, followed by the data itself.
 * get_dynamic_len_ptr() is the read-side counterpart of this layout.
 */
static void syscall_put_data(struct syscall_metadata *sys_data,
			     struct syscall_trace_enter *entry,
			     char *buffer, int size, int *user_sizes, int uargs)
{
	char *buf = buffer;
	void *ptr;
	int val;

	/*
	 * Set the pointer to point to the meta data of the event
	 * that has information about the stored user space memory.
	 */
	ptr = (void *)entry->args + sizeof(unsigned long) * sys_data->nb_args;

	/*
	 * The meta data will store the offset of the user data from
	 * the beginning of the event. That is after the static arguments
	 * and the meta data fields.
	 */
	val = (ptr - (void *)entry) + 4 * uargs;

	for (int i = 0; i < uargs; i++) {

		/* Each subsequent chunk starts after the previous one */
		if (i)
			val += user_sizes[i - 1];

		/* Store the offset and the size into the meta data */
		*(int *)ptr = val | (user_sizes[i] << 16);

		/* Skip the meta data */
		ptr += 4;
	}

	/* Now copy the data itself; each source slot is SYSCALL_FAULT_ARG_SZ wide */
	for (int i = 0; i < uargs; i++, buf += SYSCALL_FAULT_ARG_SZ) {
		/* Nothing to do if the user space was empty or faulted */
		if (!user_sizes[i])
			continue;

		memcpy(ptr, buf, user_sizes[i]);
		ptr += user_sizes[i];
	}
}
859 
/*
 * sys_enter tracepoint probe: record a syscall enter event into @data's
 * (the trace_array's) ring buffer, including any user space data that
 * the syscall's metadata marks as recordable via user_mask.
 */
static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	unsigned long args[6];
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int syscall_nr;
	int size = 0;
	int uargs = 0;
	bool mayfault;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* Is this syscall's enter event enabled for this trace array? */
	trace_file = READ_ONCE(tr->enter_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	guard(preempt_notrace)();

	syscall_get_arguments(current, regs, args);

	/* Copy the user space data first; it also grows the event size */
	if (mayfault) {
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, tr->syscall_buf_sz) < 0)
			return;
	}

	size += sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, size);
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;

	memcpy(entry->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the user space data and its meta fields */
	if (mayfault)
		syscall_put_data(sys_data, entry, user_ptr, size, user_sizes, uargs);

	trace_event_buffer_commit(&fbuffer);
}
925 
/*
 * sys_exit tracepoint probe: record a syscall exit event (syscall
 * number and return value) into @data's (the trace_array's) ring
 * buffer.
 */
static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
	struct trace_array *tr = data;
	struct trace_event_file *trace_file;
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct trace_event_buffer fbuffer;
	int syscall_nr;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;

	/* Is this syscall's exit event enabled for this trace array? */
	trace_file = READ_ONCE(tr->exit_syscall_files[syscall_nr]);
	if (!trace_file)
		return;

	if (trace_trigger_soft_disabled(trace_file))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	entry = trace_event_buffer_reserve(&fbuffer, trace_file, sizeof(*entry));
	if (!entry)
		return;

	entry = ring_buffer_event_data(fbuffer.event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	trace_event_buffer_commit(&fbuffer);
}
967 
/*
 * Enable the sys_enter event for @file's trace array.  The first
 * enabled enter event registers the tracepoint probe; syscalls that
 * record user space data also take a reference on the fault buffer.
 * Returns 0 or a negative error.
 */
static int reg_event_syscall_enter(struct trace_event_file *file,
				   struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int ret = 0;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return -ENOSYS;
	guard(mutex)(&syscall_trace_lock);
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!tr->sys_refcount_enter) {
		ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
		if (ret < 0) {
			/* Drop the fault buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	WRITE_ONCE(tr->enter_syscall_files[num], file);
	tr->sys_refcount_enter++;
	return 0;
}
997 
/*
 * Disable the sys_enter event for @file's trace array; the last
 * disabled enter event unregisters the tracepoint probe, and any fault
 * buffer reference taken at registration time is dropped.
 */
static void unreg_event_syscall_enter(struct trace_event_file *file,
				      struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	struct trace_array *tr = file->tr;
	int num;

	num = sys_data->syscall_nr;
	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
		return;
	guard(mutex)(&syscall_trace_lock);
	tr->sys_refcount_enter--;
	WRITE_ONCE(tr->enter_syscall_files[num], NULL);
	if (!tr->sys_refcount_enter)
		unregister_trace_sys_enter(ftrace_syscall_enter, tr);
	if (sys_data->user_mask)
		syscall_fault_buffer_disable();
}
1016 
1017 static int reg_event_syscall_exit(struct trace_event_file *file,
1018 				  struct trace_event_call *call)
1019 {
1020 	struct trace_array *tr = file->tr;
1021 	int ret = 0;
1022 	int num;
1023 
1024 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1025 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1026 		return -ENOSYS;
1027 	mutex_lock(&syscall_trace_lock);
1028 	if (!tr->sys_refcount_exit)
1029 		ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
1030 	if (!ret) {
1031 		WRITE_ONCE(tr->exit_syscall_files[num], file);
1032 		tr->sys_refcount_exit++;
1033 	}
1034 	mutex_unlock(&syscall_trace_lock);
1035 	return ret;
1036 }
1037 
1038 static void unreg_event_syscall_exit(struct trace_event_file *file,
1039 				     struct trace_event_call *call)
1040 {
1041 	struct trace_array *tr = file->tr;
1042 	int num;
1043 
1044 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1045 	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
1046 		return;
1047 	mutex_lock(&syscall_trace_lock);
1048 	tr->sys_refcount_exit--;
1049 	WRITE_ONCE(tr->exit_syscall_files[num], NULL);
1050 	if (!tr->sys_refcount_exit)
1051 		unregister_trace_sys_exit(ftrace_syscall_exit, tr);
1052 	mutex_unlock(&syscall_trace_lock);
1053 }
1054 
1055 /*
1056  * For system calls that reference user space memory that can
1057  * be recorded into the event, set the system call meta data's user_mask
1058  * to the "args" index that points to the user space memory to retrieve.
1059  */
1060 static void check_faultable_syscall(struct trace_event_call *call, int nr)
1061 {
1062 	struct syscall_metadata *sys_data = call->data;
1063 	unsigned long mask;
1064 
1065 	/* Only work on entry */
1066 	if (sys_data->enter_event != call)
1067 		return;
1068 
1069 	sys_data->user_arg_size = -1;
1070 
1071 	switch (nr) {
1072 	/* user arg 1 with size arg at 2 */
1073 	case __NR_write:
1074 #ifdef __NR_mq_timedsend
1075 	case __NR_mq_timedsend:
1076 #endif
1077 	case __NR_pwrite64:
1078 		sys_data->user_mask = BIT(1);
1079 		sys_data->user_arg_size = 2;
1080 		break;
1081 	/* user arg 0 with size arg at 1 as string */
1082 	case __NR_setdomainname:
1083 	case __NR_sethostname:
1084 		sys_data->user_mask = BIT(0);
1085 		sys_data->user_arg_size = 1;
1086 		sys_data->user_arg_is_str = 1;
1087 		break;
1088 #ifdef __NR_kexec_file_load
1089 	/* user arg 4 with size arg at 3 as string */
1090 	case __NR_kexec_file_load:
1091 		sys_data->user_mask = BIT(4);
1092 		sys_data->user_arg_size = 3;
1093 		sys_data->user_arg_is_str = 1;
1094 		break;
1095 #endif
1096 	/* user arg at position 0 */
1097 #ifdef __NR_access
1098 	case __NR_access:
1099 #endif
1100 	case __NR_acct:
1101 	case __NR_chdir:
1102 #ifdef  __NR_chown
1103 	case __NR_chown:
1104 #endif
1105 #ifdef  __NR_chmod
1106 	case __NR_chmod:
1107 #endif
1108 	case __NR_chroot:
1109 #ifdef __NR_creat
1110 	case __NR_creat:
1111 #endif
1112 	case __NR_delete_module:
1113 	case __NR_execve:
1114 	case __NR_fsopen:
1115 #ifdef __NR_lchown
1116 	case __NR_lchown:
1117 #endif
1118 #ifdef __NR_open
1119 	case __NR_open:
1120 #endif
1121 	case __NR_memfd_create:
1122 #ifdef __NR_mkdir
1123 	case __NR_mkdir:
1124 #endif
1125 #ifdef __NR_mknod
1126 	case __NR_mknod:
1127 #endif
1128 	case __NR_mq_open:
1129 	case __NR_mq_unlink:
1130 #ifdef __NR_readlink
1131 	case __NR_readlink:
1132 #endif
1133 #ifdef  __NR_rmdir
1134 	case __NR_rmdir:
1135 #endif
1136 	case __NR_shmdt:
1137 #ifdef __NR_statfs
1138 	case __NR_statfs:
1139 #endif
1140 	case __NR_swapon:
1141 	case __NR_swapoff:
1142 #ifdef __NR_truncate
1143 	case __NR_truncate:
1144 #endif
1145 #ifdef __NR_unlink
1146 	case __NR_unlink:
1147 #endif
1148 	case __NR_umount2:
1149 #ifdef __NR_utime
1150 	case __NR_utime:
1151 #endif
1152 #ifdef __NR_utimes
1153 	case __NR_utimes:
1154 #endif
1155 		sys_data->user_mask = BIT(0);
1156 		break;
1157 	/* user arg at position 1 */
1158 	case __NR_execveat:
1159 	case __NR_faccessat:
1160 	case __NR_faccessat2:
1161 	case __NR_finit_module:
1162 	case __NR_fchmodat:
1163 	case __NR_fchmodat2:
1164 	case __NR_fchownat:
1165 	case __NR_fgetxattr:
1166 	case __NR_flistxattr:
1167 	case __NR_fsetxattr:
1168 	case __NR_fspick:
1169 	case __NR_fremovexattr:
1170 #ifdef __NR_futimesat
1171 	case __NR_futimesat:
1172 #endif
1173 	case __NR_inotify_add_watch:
1174 	case __NR_mkdirat:
1175 	case __NR_mknodat:
1176 	case __NR_mount_setattr:
1177 	case __NR_name_to_handle_at:
1178 #ifdef __NR_newfstatat
1179 	case __NR_newfstatat:
1180 #endif
1181 	case __NR_openat:
1182 	case __NR_openat2:
1183 	case __NR_open_tree:
1184 	case __NR_open_tree_attr:
1185 	case __NR_readlinkat:
1186 	case __NR_quotactl:
1187 	case __NR_syslog:
1188 	case __NR_statx:
1189 	case __NR_unlinkat:
1190 #ifdef __NR_utimensat
1191 	case __NR_utimensat:
1192 #endif
1193 		sys_data->user_mask = BIT(1);
1194 		break;
1195 	/* user arg at position 2 */
1196 	case __NR_init_module:
1197 	case __NR_fsconfig:
1198 		sys_data->user_mask = BIT(2);
1199 		break;
1200 	/* user arg at position 4 */
1201 	case __NR_fanotify_mark:
1202 		sys_data->user_mask = BIT(4);
1203 		break;
1204 	/* 2 user args, 0 and 1 */
1205 	case __NR_add_key:
1206 	case __NR_getxattr:
1207 	case __NR_lgetxattr:
1208 	case __NR_lremovexattr:
1209 #ifdef __NR_link
1210 	case __NR_link:
1211 #endif
1212 	case __NR_listxattr:
1213 	case __NR_llistxattr:
1214 	case __NR_lsetxattr:
1215 	case __NR_pivot_root:
1216 	case __NR_removexattr:
1217 #ifdef __NR_rename
1218 	case __NR_rename:
1219 #endif
1220 	case __NR_request_key:
1221 	case __NR_setxattr:
1222 #ifdef __NR_symlink
1223 	case __NR_symlink:
1224 #endif
1225 		sys_data->user_mask = BIT(0) | BIT(1);
1226 		break;
1227 	/* 2 user args, 0 and 2 */
1228 	case __NR_symlinkat:
1229 		sys_data->user_mask = BIT(0) | BIT(2);
1230 		break;
1231 	/* 2 user args, 1 and 3 */
1232 	case __NR_getxattrat:
1233 	case __NR_linkat:
1234 	case __NR_listxattrat:
1235 	case __NR_move_mount:
1236 #ifdef __NR_renameat
1237 	case __NR_renameat:
1238 #endif
1239 	case __NR_renameat2:
1240 	case __NR_removexattrat:
1241 	case __NR_setxattrat:
1242 		sys_data->user_mask = BIT(1) | BIT(3);
1243 		break;
1244 	case __NR_mount: /* Just dev_name and dir_name, TODO add type */
1245 		sys_data->user_mask = BIT(0) | BIT(1) | BIT(2);
1246 		break;
1247 	default:
1248 		sys_data->user_mask = 0;
1249 		return;
1250 	}
1251 
1252 	if (sys_data->user_arg_size < 0)
1253 		return;
1254 
1255 	/*
1256 	 * The user_arg_size can only be used when the system call
1257 	 * is reading only a single address from user space.
1258 	 */
1259 	mask = sys_data->user_mask;
1260 	if (WARN_ON(mask & (mask - 1)))
1261 		sys_data->user_arg_size = -1;
1262 }
1263 
1264 static int __init init_syscall_trace(struct trace_event_call *call)
1265 {
1266 	int id;
1267 	int num;
1268 
1269 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1270 	if (num < 0 || num >= NR_syscalls) {
1271 		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
1272 				((struct syscall_metadata *)call->data)->name);
1273 		return -ENOSYS;
1274 	}
1275 
1276 	check_faultable_syscall(call, num);
1277 
1278 	if (set_syscall_print_fmt(call) < 0)
1279 		return -ENOMEM;
1280 
1281 	id = trace_event_raw_init(call);
1282 
1283 	if (id < 0) {
1284 		free_syscall_print_fmt(call);
1285 		return id;
1286 	}
1287 
1288 	return id;
1289 }
1290 
/*
 * Field layout for syscall-enter events: a fixed __syscall_nr field
 * followed by per-syscall argument fields, which are defined at runtime
 * via syscall_enter_define_fields().
 */
static struct trace_event_fields __refdata syscall_enter_fields_array[] = {
	SYSCALL_FIELD(int, __syscall_nr),
	{ .type = TRACE_FUNCTION_TYPE,
	  .define_fields = syscall_enter_define_fields },
	{}
};
1297 
/* Text output callback for "syscalls:sys_enter_*" events. */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};
1301 
/* Text output callback for "syscalls:sys_exit_*" events. */
struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};
1305 
/*
 * Event class shared by all syscall-enter events. The field list is
 * partly dynamic (per-syscall arguments), hence the get_fields hook
 * returning the metadata's enter_fields list.
 */
struct trace_event_class __refdata event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.fields_array	= syscall_enter_fields_array,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};
1313 
/*
 * Event class shared by all syscall-exit events. Unlike the enter
 * class, the field layout is fully static: just the syscall number
 * and its return value.
 */
struct trace_event_class __refdata event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.fields_array	= (struct trace_event_fields[]){
		SYSCALL_FIELD(int, __syscall_nr),
		SYSCALL_FIELD(long, ret),
		{}
	},
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};
1325 
/*
 * Default translation of a syscall number to its handler address,
 * read straight from sys_call_table. Weak so architectures with a
 * different table layout can override it.
 */
unsigned long __init __weak arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}
1330 
/*
 * Build the syscall-number -> syscall_metadata mapping used by the
 * syscall tracers. Dense syscall number spaces use a flat array
 * (syscalls_metadata); architectures with CONFIG_HAVE_SPARSE_SYSCALL_NR
 * store the entries in an xarray instead.
 */
void __init init_ftrace_syscalls(void)
{
	struct syscall_metadata *meta;
	unsigned long addr;
	int i;
	void *ret;

	if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
		syscalls_metadata = kzalloc_objs(*syscalls_metadata,
						 NR_syscalls);
		if (!syscalls_metadata) {
			WARN_ON(1);
			return;
		}
	}

	for (i = 0; i < NR_syscalls; i++) {
		addr = arch_syscall_addr(i);
		meta = find_syscall_meta(addr);
		if (!meta)
			continue;	/* no metadata for this table entry */

		meta->syscall_nr = i;

		if (!IS_ENABLED(CONFIG_HAVE_SPARSE_SYSCALL_NR)) {
			syscalls_metadata[i] = meta;
		} else {
			ret = xa_store(&syscalls_metadata_sparse, i, meta,
					GFP_KERNEL);
			WARN(xa_is_err(ret),
				"Syscall memory allocation failed\n");
		}

	}
}
1366 
1367 #ifdef CONFIG_PERF_EVENTS
1368 
/*
 * Per-syscall enable bitmaps for the perf probes; tested with test_bit()
 * in the probes without holding syscall_trace_lock.
 */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
/*
 * Number of perf users of each probe; the tracepoint stays registered
 * while non-zero. Both counters are protected by syscall_trace_lock.
 */
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
1373 
/*
 * Run the BPF programs attached to a syscall-enter event.
 * Builds the fixed-layout context BPF expects from the already-filled
 * trace record. A zero return tells the caller to drop the perf sample.
 */
static int perf_call_bpf_enter(struct trace_event_call *call, struct pt_regs *regs,
			       struct syscall_metadata *sys_data,
			       struct syscall_trace_enter *rec)
{
	struct syscall_tp_t {
		struct trace_entry ent;
		int syscall_nr;
		unsigned long args[SYSCALL_DEFINE_MAXARGS];
	} __aligned(8) param;
	int i;

	/* the regs pointer must fit in the trace_entry-sized header below */
	BUILD_BUG_ON(sizeof(param.ent) < sizeof(void *));

	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
	perf_fetch_caller_regs(regs);
	*(struct pt_regs **)&param = regs;
	param.syscall_nr = rec->nr;
	for (i = 0; i < sys_data->nb_args; i++)
		param.args[i] = rec->args[i];
	return trace_call_bpf(call, &param);
}
1395 
/*
 * sys_enter tracepoint probe for perf.
 * Copies the syscall arguments - and, for syscalls flagged by
 * check_faultable_syscall(), the referenced user space data - into a
 * perf trace buffer, then hands the record to attached BPF programs
 * and/or the perf events on this event's per-CPU hlist.
 */
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_enter *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	unsigned long args[6];
	bool valid_prog_array;
	bool mayfault;
	char *user_ptr;
	int user_sizes[SYSCALL_FAULT_MAX_CNT] = {};
	int buf_size = CONFIG_TRACE_SYSCALL_BUF_SIZE_DEFAULT;
	int syscall_nr;
	int rctx;
	int size = 0;
	int uargs = 0;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	syscall_get_arguments(current, regs, args);

	/* Check if this syscall event faults in user space memory */
	mayfault = sys_data->user_mask != 0;

	if (mayfault) {
		/* On success, size/user_sizes/uargs describe the fetched data */
		if (syscall_get_data(sys_data, args, &user_ptr,
				     &size, user_sizes, &uargs, buf_size) < 0)
			return;
	}

	head = this_cpu_ptr(sys_data->enter_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->enter_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* get the size after alignment with the u32 buffer size field */
	size += sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
	size = ALIGN(size + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	memcpy(&rec->args, args, sizeof(unsigned long) * sys_data->nb_args);

	/* Append the previously fetched user space data to the record */
	if (mayfault)
		syscall_put_data(sys_data, rec, user_ptr, size, user_sizes, uargs);

	/* Drop the sample if BPF said so, or if nobody is listening */
	if ((valid_prog_array &&
	     !perf_call_bpf_enter(sys_data->enter_event, fake_regs, sys_data, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx,
			      sys_data->enter_event->event.type, 1, regs,
			      head, NULL);
}
1472 
/*
 * Enable the perf sys_enter probe for one syscall.
 * The first enabled syscall registers the (global) tracepoint probe;
 * later ones only set their bit in enabled_perf_enter_syscalls.
 * Returns 0 on success or a negative error code.
 */
static int perf_sysenter_enable(struct trace_event_call *call)
{
	struct syscall_metadata *sys_data = call->data;
	int num;
	int ret;

	num = sys_data->syscall_nr;

	guard(mutex)(&syscall_trace_lock);
	/* Syscalls that read user memory need the fault buffers set up */
	if (sys_data->user_mask) {
		ret = syscall_fault_buffer_enable();
		if (ret < 0)
			return ret;
	}
	if (!sys_perf_refcount_enter) {
		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
		if (ret) {
			pr_info("event trace: Could not activate syscall entry trace point");
			/* Roll back the fault buffer reference taken above */
			if (sys_data->user_mask)
				syscall_fault_buffer_disable();
			return ret;
		}
	}
	set_bit(num, enabled_perf_enter_syscalls);
	sys_perf_refcount_enter++;
	return 0;
}
1500 
1501 static void perf_sysenter_disable(struct trace_event_call *call)
1502 {
1503 	struct syscall_metadata *sys_data = call->data;
1504 	int num;
1505 
1506 	num = sys_data->syscall_nr;
1507 
1508 	guard(mutex)(&syscall_trace_lock);
1509 	sys_perf_refcount_enter--;
1510 	clear_bit(num, enabled_perf_enter_syscalls);
1511 	if (!sys_perf_refcount_enter)
1512 		unregister_trace_sys_enter(perf_syscall_enter, NULL);
1513 	if (sys_data->user_mask)
1514 		syscall_fault_buffer_disable();
1515 }
1516 
/*
 * Run the BPF programs attached to a syscall-exit event.
 * Builds the fixed-layout context BPF expects from the filled record.
 * A zero return tells the caller to drop the perf sample.
 */
static int perf_call_bpf_exit(struct trace_event_call *call, struct pt_regs *regs,
			      struct syscall_trace_exit *rec)
{
	struct syscall_tp_t {
		struct trace_entry ent;
		int syscall_nr;
		unsigned long ret;
	} __aligned(8) param;

	/* bpf prog requires 'regs' to be the first member in the ctx (a.k.a. &param) */
	perf_fetch_caller_regs(regs);
	*(struct pt_regs **)&param = regs;
	param.syscall_nr = rec->nr;
	param.ret = rec->ret;
	return trace_call_bpf(call, &param);
}
1533 
/*
 * sys_exit tracepoint probe for perf.
 * Records the syscall number and return value into a perf trace buffer
 * and hands the record to attached BPF programs and/or the perf events
 * on this event's per-CPU hlist.
 */
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_metadata *sys_data;
	struct syscall_trace_exit *rec;
	struct pt_regs *fake_regs;
	struct hlist_head *head;
	bool valid_prog_array;
	int syscall_nr;
	int rctx;
	int size;

	/*
	 * Syscall probe called with preemption enabled, but the ring
	 * buffer and per-cpu data require preemption to be disabled.
	 */
	might_fault();
	guard(preempt_notrace)();

	syscall_nr = trace_get_syscall_nr(current, regs);
	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
		return;
	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	head = this_cpu_ptr(sys_data->exit_event->perf_events);
	valid_prog_array = bpf_prog_array_valid(sys_data->exit_event);
	if (!valid_prog_array && hlist_empty(head))
		return;

	/* We can probably do that at build time */
	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
	size -= sizeof(u32);

	rec = perf_trace_buf_alloc(size, &fake_regs, &rctx);
	if (!rec)
		return;

	rec->nr = syscall_nr;
	rec->ret = syscall_get_return_value(current, regs);

	/* Drop the sample if BPF said so, or if nobody is listening */
	if ((valid_prog_array &&
	     !perf_call_bpf_exit(sys_data->exit_event, fake_regs, rec)) ||
	    hlist_empty(head)) {
		perf_swevent_put_recursion_context(rctx);
		return;
	}

	perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
			      1, regs, head, NULL);
}
1588 
1589 static int perf_sysexit_enable(struct trace_event_call *call)
1590 {
1591 	int num;
1592 
1593 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1594 
1595 	guard(mutex)(&syscall_trace_lock);
1596 	if (!sys_perf_refcount_exit) {
1597 		int ret = register_trace_sys_exit(perf_syscall_exit, NULL);
1598 		if (ret) {
1599 			pr_info("event trace: Could not activate syscall exit trace point");
1600 			return ret;
1601 		}
1602 	}
1603 	set_bit(num, enabled_perf_exit_syscalls);
1604 	sys_perf_refcount_exit++;
1605 	return 0;
1606 }
1607 
1608 static void perf_sysexit_disable(struct trace_event_call *call)
1609 {
1610 	int num;
1611 
1612 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
1613 
1614 	guard(mutex)(&syscall_trace_lock);
1615 	sys_perf_refcount_exit--;
1616 	clear_bit(num, enabled_perf_exit_syscalls);
1617 	if (!sys_perf_refcount_exit)
1618 		unregister_trace_sys_exit(perf_syscall_exit, NULL);
1619 }
1620 
1621 #endif /* CONFIG_PERF_EVENTS */
1622 
1623 static int syscall_enter_register(struct trace_event_call *event,
1624 				 enum trace_reg type, void *data)
1625 {
1626 	struct trace_event_file *file = data;
1627 
1628 	switch (type) {
1629 	case TRACE_REG_REGISTER:
1630 		return reg_event_syscall_enter(file, event);
1631 	case TRACE_REG_UNREGISTER:
1632 		unreg_event_syscall_enter(file, event);
1633 		return 0;
1634 
1635 #ifdef CONFIG_PERF_EVENTS
1636 	case TRACE_REG_PERF_REGISTER:
1637 		return perf_sysenter_enable(event);
1638 	case TRACE_REG_PERF_UNREGISTER:
1639 		perf_sysenter_disable(event);
1640 		return 0;
1641 	case TRACE_REG_PERF_OPEN:
1642 	case TRACE_REG_PERF_CLOSE:
1643 	case TRACE_REG_PERF_ADD:
1644 	case TRACE_REG_PERF_DEL:
1645 		return 0;
1646 #endif
1647 	}
1648 	return 0;
1649 }
1650 
1651 static int syscall_exit_register(struct trace_event_call *event,
1652 				 enum trace_reg type, void *data)
1653 {
1654 	struct trace_event_file *file = data;
1655 
1656 	switch (type) {
1657 	case TRACE_REG_REGISTER:
1658 		return reg_event_syscall_exit(file, event);
1659 	case TRACE_REG_UNREGISTER:
1660 		unreg_event_syscall_exit(file, event);
1661 		return 0;
1662 
1663 #ifdef CONFIG_PERF_EVENTS
1664 	case TRACE_REG_PERF_REGISTER:
1665 		return perf_sysexit_enable(event);
1666 	case TRACE_REG_PERF_UNREGISTER:
1667 		perf_sysexit_disable(event);
1668 		return 0;
1669 	case TRACE_REG_PERF_OPEN:
1670 	case TRACE_REG_PERF_CLOSE:
1671 	case TRACE_REG_PERF_ADD:
1672 	case TRACE_REG_PERF_DEL:
1673 		return 0;
1674 #endif
1675 	}
1676 	return 0;
1677 }
1678