xref: /linux/kernel/trace/trace_syscalls.c (revision 394d83c17fac2b7bcf05cb99d1e945135767bb6b)
1 #include <trace/syscall.h>
2 #include <trace/events/syscalls.h>
3 #include <linux/slab.h>
4 #include <linux/kernel.h>
5 #include <linux/ftrace.h>
6 #include <linux/perf_event.h>
7 #include <asm/syscall.h>
8 
9 #include "trace_output.h"
10 #include "trace.h"
11 
/* Serializes registration/unregistration of the syscall tracepoint probes */
static DEFINE_MUTEX(syscall_trace_lock);
/* Number of events currently using the shared sys_enter/sys_exit probes */
static int sys_refcount_enter;
static int sys_refcount_exit;
/* One bit per syscall nr: ftrace recording enabled for entry/exit */
static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 
/* Forward declarations for the event-class callbacks defined at end of file */
static int syscall_enter_register(struct ftrace_event_call *event,
				 enum trace_reg type);
static int syscall_exit_register(struct ftrace_event_call *event,
				 enum trace_reg type);

static int syscall_enter_define_fields(struct ftrace_event_call *call);
static int syscall_exit_define_fields(struct ftrace_event_call *call);
25 
26 static struct list_head *
27 syscall_get_enter_fields(struct ftrace_event_call *call)
28 {
29 	struct syscall_metadata *entry = call->data;
30 
31 	return &entry->enter_fields;
32 }
33 
/* Output callbacks used when rendering syscall events from the ring buffer */
struct trace_event_functions enter_syscall_print_funcs = {
	.trace		= print_syscall_enter,
};

struct trace_event_functions exit_syscall_print_funcs = {
	.trace		= print_syscall_exit,
};
41 
/*
 * Event class shared by all syscall-entry events.  Entry events carry
 * per-syscall argument fields, so the field list is fetched dynamically
 * through ->get_fields() from each event's metadata.
 */
struct ftrace_event_class event_class_syscall_enter = {
	.system		= "syscalls",
	.reg		= syscall_enter_register,
	.define_fields	= syscall_enter_define_fields,
	.get_fields	= syscall_get_enter_fields,
	.raw_init	= init_syscall_trace,
};

/*
 * Event class shared by all syscall-exit events.  Exit events all have
 * the same fields (nr, ret), so one static list serves every event.
 */
struct ftrace_event_class event_class_syscall_exit = {
	.system		= "syscalls",
	.reg		= syscall_exit_register,
	.define_fields	= syscall_exit_define_fields,
	.fields		= LIST_HEAD_INIT(event_class_syscall_exit.fields),
	.raw_init	= init_syscall_trace,
};
57 
/* Linker-provided bounds of the compiled-in syscall metadata section */
extern unsigned long __start_syscalls_metadata[];
extern unsigned long __stop_syscalls_metadata[];

/* Indexed by syscall nr; NULL where no metadata was matched at boot */
static struct syscall_metadata **syscalls_metadata;
62 
63 static struct syscall_metadata *find_syscall_meta(unsigned long syscall)
64 {
65 	struct syscall_metadata *start;
66 	struct syscall_metadata *stop;
67 	char str[KSYM_SYMBOL_LEN];
68 
69 
70 	start = (struct syscall_metadata *)__start_syscalls_metadata;
71 	stop = (struct syscall_metadata *)__stop_syscalls_metadata;
72 	kallsyms_lookup(syscall, NULL, NULL, NULL, str);
73 
74 	for ( ; start < stop; start++) {
75 		/*
76 		 * Only compare after the "sys" prefix. Archs that use
77 		 * syscall wrappers may have syscalls symbols aliases prefixed
78 		 * with "SyS" instead of "sys", leading to an unwanted
79 		 * mismatch.
80 		 */
81 		if (start->name && !strcmp(start->name + 3, str + 3))
82 			return start;
83 	}
84 	return NULL;
85 }
86 
87 static struct syscall_metadata *syscall_nr_to_meta(int nr)
88 {
89 	if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
90 		return NULL;
91 
92 	return syscalls_metadata[nr];
93 }
94 
/*
 * trace_event output callback: render a syscall-entry record as
 * "name(arg: value, ...)".  With TRACE_ITER_VERBOSE set, each value is
 * preceded by its parameter type.
 */
enum print_line_t
print_syscall_enter(struct trace_iterator *iter, int flags,
		    struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_enter *trace;
	struct syscall_metadata *entry;
	int i, ret, syscall;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall nr: just emit the terminating newline */
	if (!entry)
		goto end;

	/* The record's event type must match the metadata we looked up */
	if (entry->enter_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		goto end;
	}

	/* trace_seq_* return 0 when the output buffer is full */
	ret = trace_seq_printf(s, "%s(", entry->name);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	for (i = 0; i < entry->nb_args; i++) {
		/* parameter types */
		if (trace_flags & TRACE_ITER_VERBOSE) {
			ret = trace_seq_printf(s, "%s ", entry->types[i]);
			if (!ret)
				return TRACE_TYPE_PARTIAL_LINE;
		}
		/* parameter values */
		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
				       trace->args[i],
				       i == entry->nb_args - 1 ? "" : ", ");
		if (!ret)
			return TRACE_TYPE_PARTIAL_LINE;
	}

	ret = trace_seq_putc(s, ')');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

end:
	ret =  trace_seq_putc(s, '\n');
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}
147 
/*
 * trace_event output callback: render a syscall-exit record as
 * "name -> 0x<ret>".
 */
enum print_line_t
print_syscall_exit(struct trace_iterator *iter, int flags,
		   struct trace_event *event)
{
	struct trace_seq *s = &iter->seq;
	struct trace_entry *ent = iter->ent;
	struct syscall_trace_exit *trace;
	int syscall;
	struct syscall_metadata *entry;
	int ret;

	trace = (typeof(trace))ent;
	syscall = trace->nr;
	entry = syscall_nr_to_meta(syscall);

	/* Unknown syscall nr: emit a bare newline and report it handled */
	if (!entry) {
		trace_seq_printf(s, "\n");
		return TRACE_TYPE_HANDLED;
	}

	/* The record's event type must match the metadata we looked up */
	if (entry->exit_event->event.type != ent->type) {
		WARN_ON_ONCE(1);
		return TRACE_TYPE_UNHANDLED;
	}

	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
				trace->ret);
	if (!ret)
		return TRACE_TYPE_PARTIAL_LINE;

	return TRACE_TYPE_HANDLED;
}
180 
/* Never defined: referencing it fails the link if the sizes below mismatch */
extern char *__bad_type_size(void);

/*
 * Expand to the (type-name, field-name, offset, size, signedness)
 * argument list expected by trace_define_field(), with a build-time
 * check that sizeof(type) matches the corresponding struct member.
 * Relies on a local variable named "trace" of the record type.
 */
#define SYSCALL_FIELD(type, name)					\
	sizeof(type) != sizeof(trace.name) ?				\
		__bad_type_size() :					\
		#type, #name, offsetof(typeof(trace), name),		\
		sizeof(trace.name), is_signed_type(type)
188 
/*
 * Build the print_fmt string for a syscall-entry event into @buf:
 * a quoted format with one "0x%lx" per argument, followed by the
 * matching REC->arg accessors.  Called twice: once with @len == 0 to
 * size the buffer, then again to fill it.  Returns the length of the
 * full string, excluding the trailing NUL.
 */
static
int  __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
{
	int i;
	int pos = 0;

	/* When len=0, we just calculate the needed length */
#define LEN_OR_ZERO (len ? len - pos : 0)

	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
				entry->args[i], sizeof(unsigned long),
				i == entry->nb_args - 1 ? "" : ", ");
	}
	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");

	for (i = 0; i < entry->nb_args; i++) {
		pos += snprintf(buf + pos, LEN_OR_ZERO,
				", ((unsigned long)(REC->%s))", entry->args[i]);
	}

#undef LEN_OR_ZERO

	/* return the length of print_fmt */
	return pos;
}
216 
217 static int set_syscall_print_fmt(struct ftrace_event_call *call)
218 {
219 	char *print_fmt;
220 	int len;
221 	struct syscall_metadata *entry = call->data;
222 
223 	if (entry->enter_event != call) {
224 		call->print_fmt = "\"0x%lx\", REC->ret";
225 		return 0;
226 	}
227 
228 	/* First: called with 0 length to calculate the needed length */
229 	len = __set_enter_print_fmt(entry, NULL, 0);
230 
231 	print_fmt = kmalloc(len + 1, GFP_KERNEL);
232 	if (!print_fmt)
233 		return -ENOMEM;
234 
235 	/* Second: actually write the @print_fmt */
236 	__set_enter_print_fmt(entry, print_fmt, len + 1);
237 	call->print_fmt = print_fmt;
238 
239 	return 0;
240 }
241 
242 static void free_syscall_print_fmt(struct ftrace_event_call *call)
243 {
244 	struct syscall_metadata *entry = call->data;
245 
246 	if (entry->enter_event == call)
247 		kfree(call->print_fmt);
248 }
249 
250 static int syscall_enter_define_fields(struct ftrace_event_call *call)
251 {
252 	struct syscall_trace_enter trace;
253 	struct syscall_metadata *meta = call->data;
254 	int ret;
255 	int i;
256 	int offset = offsetof(typeof(trace), args);
257 
258 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
259 	if (ret)
260 		return ret;
261 
262 	for (i = 0; i < meta->nb_args; i++) {
263 		ret = trace_define_field(call, meta->types[i],
264 					 meta->args[i], offset,
265 					 sizeof(unsigned long), 0,
266 					 FILTER_OTHER);
267 		offset += sizeof(unsigned long);
268 	}
269 
270 	return ret;
271 }
272 
273 static int syscall_exit_define_fields(struct ftrace_event_call *call)
274 {
275 	struct syscall_trace_exit trace;
276 	int ret;
277 
278 	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
279 	if (ret)
280 		return ret;
281 
282 	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
283 				 FILTER_OTHER);
284 
285 	return ret;
286 }
287 
/*
 * sys_enter tracepoint probe: record one syscall-entry event (syscall
 * number plus all arguments) into the ftrace ring buffer, if tracing
 * is enabled for this syscall.
 */
void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{
	struct syscall_trace_enter *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int size;
	int syscall_nr;

	/* syscall_get_nr() can return -1 when regs hold no syscall */
	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_enter_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	/* variable-size record: header plus one long per argument */
	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->enter_event->event.type, size, 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);

	/* commit the reserved slot unless the event filter discards it */
	if (!filter_current_check_discard(buffer, sys_data->enter_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
322 
/*
 * sys_exit tracepoint probe: record one syscall-exit event (syscall
 * number and return value) into the ftrace ring buffer, if tracing is
 * enabled for this syscall.
 */
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{
	struct syscall_trace_exit *entry;
	struct syscall_metadata *sys_data;
	struct ring_buffer_event *event;
	struct ring_buffer *buffer;
	int syscall_nr;

	/* syscall_get_nr() can return -1 when regs hold no syscall */
	syscall_nr = syscall_get_nr(current, regs);
	if (syscall_nr < 0)
		return;
	if (!test_bit(syscall_nr, enabled_exit_syscalls))
		return;

	sys_data = syscall_nr_to_meta(syscall_nr);
	if (!sys_data)
		return;

	event = trace_current_buffer_lock_reserve(&buffer,
			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
	if (!event)
		return;

	entry = ring_buffer_event_data(event);
	entry->nr = syscall_nr;
	entry->ret = syscall_get_return_value(current, regs);

	/* commit the reserved slot unless the event filter discards it */
	if (!filter_current_check_discard(buffer, sys_data->exit_event,
					  entry, event))
		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
354 
355 int reg_event_syscall_enter(struct ftrace_event_call *call)
356 {
357 	int ret = 0;
358 	int num;
359 
360 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
361 	if (num < 0 || num >= NR_syscalls)
362 		return -ENOSYS;
363 	mutex_lock(&syscall_trace_lock);
364 	if (!sys_refcount_enter)
365 		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
366 	if (!ret) {
367 		set_bit(num, enabled_enter_syscalls);
368 		sys_refcount_enter++;
369 	}
370 	mutex_unlock(&syscall_trace_lock);
371 	return ret;
372 }
373 
374 void unreg_event_syscall_enter(struct ftrace_event_call *call)
375 {
376 	int num;
377 
378 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
379 	if (num < 0 || num >= NR_syscalls)
380 		return;
381 	mutex_lock(&syscall_trace_lock);
382 	sys_refcount_enter--;
383 	clear_bit(num, enabled_enter_syscalls);
384 	if (!sys_refcount_enter)
385 		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
386 	mutex_unlock(&syscall_trace_lock);
387 }
388 
389 int reg_event_syscall_exit(struct ftrace_event_call *call)
390 {
391 	int ret = 0;
392 	int num;
393 
394 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
395 	if (num < 0 || num >= NR_syscalls)
396 		return -ENOSYS;
397 	mutex_lock(&syscall_trace_lock);
398 	if (!sys_refcount_exit)
399 		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
400 	if (!ret) {
401 		set_bit(num, enabled_exit_syscalls);
402 		sys_refcount_exit++;
403 	}
404 	mutex_unlock(&syscall_trace_lock);
405 	return ret;
406 }
407 
408 void unreg_event_syscall_exit(struct ftrace_event_call *call)
409 {
410 	int num;
411 
412 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
413 	if (num < 0 || num >= NR_syscalls)
414 		return;
415 	mutex_lock(&syscall_trace_lock);
416 	sys_refcount_exit--;
417 	clear_bit(num, enabled_exit_syscalls);
418 	if (!sys_refcount_exit)
419 		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
420 	mutex_unlock(&syscall_trace_lock);
421 }
422 
423 int init_syscall_trace(struct ftrace_event_call *call)
424 {
425 	int id;
426 
427 	if (set_syscall_print_fmt(call) < 0)
428 		return -ENOMEM;
429 
430 	id = trace_event_raw_init(call);
431 
432 	if (id < 0) {
433 		free_syscall_print_fmt(call);
434 		return id;
435 	}
436 
437 	return id;
438 }
439 
/* Return the address of the handler installed for syscall @nr */
unsigned long __init arch_syscall_addr(int nr)
{
	return (unsigned long)sys_call_table[nr];
}
444 
445 int __init init_ftrace_syscalls(void)
446 {
447 	struct syscall_metadata *meta;
448 	unsigned long addr;
449 	int i;
450 
451 	syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
452 					NR_syscalls, GFP_KERNEL);
453 	if (!syscalls_metadata) {
454 		WARN_ON(1);
455 		return -ENOMEM;
456 	}
457 
458 	for (i = 0; i < NR_syscalls; i++) {
459 		addr = arch_syscall_addr(i);
460 		meta = find_syscall_meta(addr);
461 		if (!meta)
462 			continue;
463 
464 		meta->syscall_nr = i;
465 		syscalls_metadata[i] = meta;
466 	}
467 
468 	return 0;
469 }
470 core_initcall(init_ftrace_syscalls);
471 
472 #ifdef CONFIG_PERF_EVENTS
473 
/* Per-syscall enable bits and user refcounts for the perf probes below */
static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit;
478 
479 static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
480 {
481 	struct syscall_metadata *sys_data;
482 	struct syscall_trace_enter *rec;
483 	struct hlist_head *head;
484 	int syscall_nr;
485 	int rctx;
486 	int size;
487 
488 	syscall_nr = syscall_get_nr(current, regs);
489 	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
490 		return;
491 
492 	sys_data = syscall_nr_to_meta(syscall_nr);
493 	if (!sys_data)
494 		return;
495 
496 	/* get the size after alignment with the u32 buffer size field */
497 	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
498 	size = ALIGN(size + sizeof(u32), sizeof(u64));
499 	size -= sizeof(u32);
500 
501 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
502 		      "perf buffer not large enough"))
503 		return;
504 
505 	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
506 				sys_data->enter_event->event.type, regs, &rctx);
507 	if (!rec)
508 		return;
509 
510 	rec->nr = syscall_nr;
511 	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
512 			       (unsigned long *)&rec->args);
513 
514 	head = this_cpu_ptr(sys_data->enter_event->perf_events);
515 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
516 }
517 
518 int perf_sysenter_enable(struct ftrace_event_call *call)
519 {
520 	int ret = 0;
521 	int num;
522 
523 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
524 
525 	mutex_lock(&syscall_trace_lock);
526 	if (!sys_perf_refcount_enter)
527 		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
528 	if (ret) {
529 		pr_info("event trace: Could not activate"
530 				"syscall entry trace point");
531 	} else {
532 		set_bit(num, enabled_perf_enter_syscalls);
533 		sys_perf_refcount_enter++;
534 	}
535 	mutex_unlock(&syscall_trace_lock);
536 	return ret;
537 }
538 
539 void perf_sysenter_disable(struct ftrace_event_call *call)
540 {
541 	int num;
542 
543 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
544 
545 	mutex_lock(&syscall_trace_lock);
546 	sys_perf_refcount_enter--;
547 	clear_bit(num, enabled_perf_enter_syscalls);
548 	if (!sys_perf_refcount_enter)
549 		unregister_trace_sys_enter(perf_syscall_enter, NULL);
550 	mutex_unlock(&syscall_trace_lock);
551 }
552 
553 static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
554 {
555 	struct syscall_metadata *sys_data;
556 	struct syscall_trace_exit *rec;
557 	struct hlist_head *head;
558 	int syscall_nr;
559 	int rctx;
560 	int size;
561 
562 	syscall_nr = syscall_get_nr(current, regs);
563 	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
564 		return;
565 
566 	sys_data = syscall_nr_to_meta(syscall_nr);
567 	if (!sys_data)
568 		return;
569 
570 	/* We can probably do that at build time */
571 	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
572 	size -= sizeof(u32);
573 
574 	/*
575 	 * Impossible, but be paranoid with the future
576 	 * How to put this check outside runtime?
577 	 */
578 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
579 		"exit event has grown above perf buffer size"))
580 		return;
581 
582 	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
583 				sys_data->exit_event->event.type, regs, &rctx);
584 	if (!rec)
585 		return;
586 
587 	rec->nr = syscall_nr;
588 	rec->ret = syscall_get_return_value(current, regs);
589 
590 	head = this_cpu_ptr(sys_data->exit_event->perf_events);
591 	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
592 }
593 
594 int perf_sysexit_enable(struct ftrace_event_call *call)
595 {
596 	int ret = 0;
597 	int num;
598 
599 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
600 
601 	mutex_lock(&syscall_trace_lock);
602 	if (!sys_perf_refcount_exit)
603 		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
604 	if (ret) {
605 		pr_info("event trace: Could not activate"
606 				"syscall exit trace point");
607 	} else {
608 		set_bit(num, enabled_perf_exit_syscalls);
609 		sys_perf_refcount_exit++;
610 	}
611 	mutex_unlock(&syscall_trace_lock);
612 	return ret;
613 }
614 
615 void perf_sysexit_disable(struct ftrace_event_call *call)
616 {
617 	int num;
618 
619 	num = ((struct syscall_metadata *)call->data)->syscall_nr;
620 
621 	mutex_lock(&syscall_trace_lock);
622 	sys_perf_refcount_exit--;
623 	clear_bit(num, enabled_perf_exit_syscalls);
624 	if (!sys_perf_refcount_exit)
625 		unregister_trace_sys_exit(perf_syscall_exit, NULL);
626 	mutex_unlock(&syscall_trace_lock);
627 }
628 
629 #endif /* CONFIG_PERF_EVENTS */
630 
631 static int syscall_enter_register(struct ftrace_event_call *event,
632 				 enum trace_reg type)
633 {
634 	switch (type) {
635 	case TRACE_REG_REGISTER:
636 		return reg_event_syscall_enter(event);
637 	case TRACE_REG_UNREGISTER:
638 		unreg_event_syscall_enter(event);
639 		return 0;
640 
641 #ifdef CONFIG_PERF_EVENTS
642 	case TRACE_REG_PERF_REGISTER:
643 		return perf_sysenter_enable(event);
644 	case TRACE_REG_PERF_UNREGISTER:
645 		perf_sysenter_disable(event);
646 		return 0;
647 #endif
648 	}
649 	return 0;
650 }
651 
652 static int syscall_exit_register(struct ftrace_event_call *event,
653 				 enum trace_reg type)
654 {
655 	switch (type) {
656 	case TRACE_REG_REGISTER:
657 		return reg_event_syscall_exit(event);
658 	case TRACE_REG_UNREGISTER:
659 		unreg_event_syscall_exit(event);
660 		return 0;
661 
662 #ifdef CONFIG_PERF_EVENTS
663 	case TRACE_REG_PERF_REGISTER:
664 		return perf_sysexit_enable(event);
665 	case TRACE_REG_PERF_UNREGISTER:
666 		perf_sysexit_disable(event);
667 		return 0;
668 #endif
669 	}
670 	return 0;
671 }
672