xref: /linux/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c (revision 36ec807b627b4c0a0a382f0ae48eac7187d14b2b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4  *
5  * This exactly matches what is marshalled into the raw_syscall:sys_enter
6  * payload expected by the 'perf trace' beautifiers.
7  */
8 
9 #include "vmlinux.h"
10 #include <bpf/bpf_helpers.h>
11 #include <linux/limits.h>
12 
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check; it is expanded more than once, so it must not
 *     have side effects
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.  Every use of @n and the expansion as a whole
 * are parenthesized, so an argument built with low-precedence operators
 * (e.g. `a | b`) is tested as a single value.  When @n is an integer
 * constant expression so is the result, which makes the macro usable in
 * _Static_assert().
 *
 * Return: true if @n is a power of 2, otherwise false.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
22 
23 #define MAX_CPUS  4096
24 
25 /* bpf-output associated map */
26 struct __augmented_syscalls__ {
27 	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
28 	__type(key, int);
29 	__type(value, __u32);
30 	__uint(max_entries, MAX_CPUS);
31 } __augmented_syscalls__ SEC(".maps");
32 
33 /*
34  * What to augment at entry?
35  *
36  * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
37  */
38 struct syscalls_sys_enter {
39 	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
40 	__type(key, __u32);
41 	__type(value, __u32);
42 	__uint(max_entries, 512);
43 } syscalls_sys_enter SEC(".maps");
44 
45 /*
46  * What to augment at exit?
47  *
48  * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
49  */
50 struct syscalls_sys_exit {
51 	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
52 	__type(key, __u32);
53 	__type(value, __u32);
54 	__uint(max_entries, 512);
55 } syscalls_sys_exit SEC(".maps");
56 
/*
 * Context handed to the sys_enter programs in this file; it is also the
 * leading, non-augmented part of every record they emit.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;	/* opaque here, never inspected */
	long		   syscall_nr;		/* index used for the tail call */
	unsigned long	   args[6];		/* raw syscall arguments */
};
62 
/* Context handed to the sys_exit program in this file. */
struct syscall_exit_args {
	unsigned long long common_tp_fields;	/* opaque here, never inspected */
	long		   syscall_nr;		/* index used for the tail call */
	long		   ret;			/* not read by this file */
};
68 
/*
 * One augmented string argument as laid out in the output record: a small
 * header followed by the bytes copied from userspace.
 */
struct augmented_arg {
	unsigned int	size;			/* bytes stored in value, 0 on error */
	int		err;			/* non-positive read result, else 0 */
	char		value[PATH_MAX];	/* the copied string */
};
74 
75 struct pids_filtered {
76 	__uint(type, BPF_MAP_TYPE_HASH);
77 	__type(key, pid_t);
78 	__type(value, bool);
79 	__uint(max_entries, 64);
80 } pids_filtered SEC(".maps");
81 
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE   128     /* Implementation specific max size */

typedef unsigned short sa_family_t;

/*
 * FIXME: Should come from system headers
 *
 * The anonymous union/struct pair is there to control the default
 * alignment: the family plus padding make up SS_MAXSIZE bytes, and the
 * pointer member gives the whole object pointer alignment.
 */
struct sockaddr_storage {
	union {
		struct {
			/* address family */
			sa_family_t ss_family;
			/*
			 * Implementation specific filler: SS_MAXSIZE minus
			 * the size of ss_family.
			 */
			char __data[SS_MAXSIZE - sizeof(sa_family_t)];
		};
		/* implementation specific desired alignment */
		void *__align;
	};
};
107 
108 struct augmented_args_payload {
109        struct syscall_enter_args args;
110        union {
111 		struct {
112 			struct augmented_arg arg, arg2;
113 		};
114 		struct sockaddr_storage saddr;
115 		char   __data[sizeof(struct augmented_arg)];
116 	};
117 };
118 
119 // We need more tmp space than the BPF stack can give us
120 struct augmented_args_tmp {
121 	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
122 	__type(key, int);
123 	__type(value, struct augmented_args_payload);
124 	__uint(max_entries, 1);
125 } augmented_args_tmp SEC(".maps");
126 
127 static inline struct augmented_args_payload *augmented_args_payload(void)
128 {
129 	int key = 0;
130 	return bpf_map_lookup_elem(&augmented_args_tmp, &key);
131 }
132 
133 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
134 {
135 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
136 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
137 }
138 
139 static inline
140 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
141 {
142 	unsigned int augmented_len = sizeof(*augmented_arg);
143 	int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
144 
145 	augmented_arg->size = augmented_arg->err = 0;
146 	/*
147 	 * probe_read_str may return < 0, e.g. -EFAULT
148 	 * So we leave that in the augmented_arg->size that userspace will
149 	 */
150 	if (string_len > 0) {
151 		augmented_len -= sizeof(augmented_arg->value) - string_len;
152 		_Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
153 		augmented_len &= sizeof(augmented_arg->value) - 1;
154 		augmented_arg->size = string_len;
155 	} else {
156 		/*
157 		 * So that username notice the error while still being able
158 		 * to skip this augmented arg record
159 		 */
160 		augmented_arg->err = string_len;
161 		augmented_len = offsetof(struct augmented_arg, value);
162 	}
163 
164 	return augmented_len;
165 }
166 
167 SEC("tp/raw_syscalls/sys_enter")
168 int syscall_unaugmented(struct syscall_enter_args *args)
169 {
170 	return 1;
171 }
172 
/*
 * The syscall specific augmenters below are tail called from the
 * SEC("tp/raw_syscalls/sys_enter") program, so they find in
 * augmented_args_tmp the raw payload that program already copied and go on
 * from there, appending the contents of pointer arguments, e.g. reading a
 * syscall arg as a string, such as open's filename.
 */
179 SEC("tp/syscalls/sys_enter_connect")
180 int sys_enter_connect(struct syscall_enter_args *args)
181 {
182 	struct augmented_args_payload *augmented_args = augmented_args_payload();
183 	const void *sockaddr_arg = (const void *)args->args[1];
184 	unsigned int socklen = args->args[2];
185 	unsigned int len = sizeof(augmented_args->args);
186 
187         if (augmented_args == NULL)
188                 return 1; /* Failure: don't filter */
189 
190 	_Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two");
191 	socklen &= sizeof(augmented_args->saddr) - 1;
192 
193 	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
194 
195 	return augmented__output(args, augmented_args, len + socklen);
196 }
197 
198 SEC("tp/syscalls/sys_enter_sendto")
199 int sys_enter_sendto(struct syscall_enter_args *args)
200 {
201 	struct augmented_args_payload *augmented_args = augmented_args_payload();
202 	const void *sockaddr_arg = (const void *)args->args[4];
203 	unsigned int socklen = args->args[5];
204 	unsigned int len = sizeof(augmented_args->args);
205 
206         if (augmented_args == NULL)
207                 return 1; /* Failure: don't filter */
208 
209 	socklen &= sizeof(augmented_args->saddr) - 1;
210 
211 	bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg);
212 
213 	return augmented__output(args, augmented_args, len + socklen);
214 }
215 
216 SEC("tp/syscalls/sys_enter_open")
217 int sys_enter_open(struct syscall_enter_args *args)
218 {
219 	struct augmented_args_payload *augmented_args = augmented_args_payload();
220 	const void *filename_arg = (const void *)args->args[0];
221 	unsigned int len = sizeof(augmented_args->args);
222 
223         if (augmented_args == NULL)
224                 return 1; /* Failure: don't filter */
225 
226 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
227 
228 	return augmented__output(args, augmented_args, len);
229 }
230 
231 SEC("tp/syscalls/sys_enter_openat")
232 int sys_enter_openat(struct syscall_enter_args *args)
233 {
234 	struct augmented_args_payload *augmented_args = augmented_args_payload();
235 	const void *filename_arg = (const void *)args->args[1];
236 	unsigned int len = sizeof(augmented_args->args);
237 
238         if (augmented_args == NULL)
239                 return 1; /* Failure: don't filter */
240 
241 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
242 
243 	return augmented__output(args, augmented_args, len);
244 }
245 
246 SEC("tp/syscalls/sys_enter_rename")
247 int sys_enter_rename(struct syscall_enter_args *args)
248 {
249 	struct augmented_args_payload *augmented_args = augmented_args_payload();
250 	const void *oldpath_arg = (const void *)args->args[0],
251 		   *newpath_arg = (const void *)args->args[1];
252 	unsigned int len = sizeof(augmented_args->args), oldpath_len;
253 
254         if (augmented_args == NULL)
255                 return 1; /* Failure: don't filter */
256 
257 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
258 	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
259 
260 	return augmented__output(args, augmented_args, len);
261 }
262 
263 SEC("tp/syscalls/sys_enter_renameat")
264 int sys_enter_renameat(struct syscall_enter_args *args)
265 {
266 	struct augmented_args_payload *augmented_args = augmented_args_payload();
267 	const void *oldpath_arg = (const void *)args->args[1],
268 		   *newpath_arg = (const void *)args->args[3];
269 	unsigned int len = sizeof(augmented_args->args), oldpath_len;
270 
271         if (augmented_args == NULL)
272                 return 1; /* Failure: don't filter */
273 
274 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
275 	len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value));
276 
277 	return augmented__output(args, augmented_args, len);
278 }
279 
#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */

/*
 * Just the start of the attr: enough to get at its size field, which then
 * tells how many bytes to copy.
 */
struct perf_event_attr_size {
	__u32			type;
	/* Size of the attr structure, for fwd/bwd compat. */
	__u32			size;
};
290 
291 SEC("tp/syscalls/sys_enter_perf_event_open")
292 int sys_enter_perf_event_open(struct syscall_enter_args *args)
293 {
294 	struct augmented_args_payload *augmented_args = augmented_args_payload();
295 	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
296 	unsigned int len = sizeof(augmented_args->args);
297 
298         if (augmented_args == NULL)
299 		goto failure;
300 
301 	if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0)
302 		goto failure;
303 
304 	attr_read = (const struct perf_event_attr_size *)augmented_args->__data;
305 
306 	__u32 size = attr_read->size;
307 
308 	if (!size)
309 		size = PERF_ATTR_SIZE_VER0;
310 
311 	if (size > sizeof(augmented_args->__data))
312                 goto failure;
313 
314 	// Now that we read attr->size and tested it against the size limits, read it completely
315 	if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0)
316 		goto failure;
317 
318 	return augmented__output(args, augmented_args, len + size);
319 failure:
320 	return 1; /* Failure: don't filter */
321 }
322 
323 SEC("tp/syscalls/sys_enter_clock_nanosleep")
324 int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
325 {
326 	struct augmented_args_payload *augmented_args = augmented_args_payload();
327 	const void *rqtp_arg = (const void *)args->args[2];
328 	unsigned int len = sizeof(augmented_args->args);
329 	__u32 size = sizeof(struct timespec64);
330 
331         if (augmented_args == NULL)
332 		goto failure;
333 
334 	if (size > sizeof(augmented_args->__data))
335                 goto failure;
336 
337 	bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg);
338 
339 	return augmented__output(args, augmented_args, len + size);
340 failure:
341 	return 1; /* Failure: don't filter */
342 }
343 
344 SEC("tp/syscalls/sys_enter_nanosleep")
345 int sys_enter_nanosleep(struct syscall_enter_args *args)
346 {
347 	struct augmented_args_payload *augmented_args = augmented_args_payload();
348 	const void *req_arg = (const void *)args->args[0];
349 	unsigned int len = sizeof(augmented_args->args);
350 	__u32 size = sizeof(struct timespec64);
351 
352         if (augmented_args == NULL)
353 		goto failure;
354 
355 	if (size > sizeof(augmented_args->__data))
356                 goto failure;
357 
358 	bpf_probe_read_user(&augmented_args->__data, size, req_arg);
359 
360 	return augmented__output(args, augmented_args, len + size);
361 failure:
362 	return 1; /* Failure: don't filter */
363 }
364 
365 static pid_t getpid(void)
366 {
367 	return bpf_get_current_pid_tgid();
368 }
369 
370 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
371 {
372 	return bpf_map_lookup_elem(pids, &pid) != NULL;
373 }
374 
375 SEC("tp/raw_syscalls/sys_enter")
376 int sys_enter(struct syscall_enter_args *args)
377 {
378 	struct augmented_args_payload *augmented_args;
379 	/*
380 	 * We start len, the amount of data that will be in the perf ring
381 	 * buffer, if this is not filtered out by one of pid_filter__has(),
382 	 * syscall->enabled, etc, with the non-augmented raw syscall payload,
383 	 * i.e. sizeof(augmented_args->args).
384 	 *
385 	 * We'll add to this as we add augmented syscalls right after that
386 	 * initial, non-augmented raw_syscalls:sys_enter payload.
387 	 */
388 
389 	if (pid_filter__has(&pids_filtered, getpid()))
390 		return 0;
391 
392 	augmented_args = augmented_args_payload();
393 	if (augmented_args == NULL)
394 		return 1;
395 
396 	bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
397 
398 	/*
399 	 * Jump to syscall specific augmenter, even if the default one,
400 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
401 	 * unaugmented tracepoint payload.
402 	 */
403 	bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
404 
405 	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
406 	return 0;
407 }
408 
409 SEC("tp/raw_syscalls/sys_exit")
410 int sys_exit(struct syscall_exit_args *args)
411 {
412 	struct syscall_exit_args exit_args;
413 
414 	if (pid_filter__has(&pids_filtered, getpid()))
415 		return 0;
416 
417 	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
418 	/*
419 	 * Jump to syscall specific return augmenter, even if the default one,
420 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
421 	 * unaugmented tracepoint payload.
422 	 */
423 	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
424 	/*
425 	 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
426 	 */
427 	return 0;
428 }
429 
430 char _license[] SEC("license") = "GPL";
431