xref: /linux/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c (revision c94cd9508b1335b949fd13ebd269313c65492df0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4  *
5  * This exactly matches what is marshalled into the raw_syscall:sys_enter
6  * payload expected by the 'perf trace' beautifiers.
7  */
8 
9 #include "vmlinux.h"
10 #include "../trace_augment.h"
11 
12 #include <bpf/bpf_helpers.h>
13 #include <linux/limits.h>
14 
/*
 * Round @x up to the next multiple of @a, which must be a power of two.
 *
 * The alignment is converted to the type of @x before the mask is built, so
 * that the complement in __PERF_ALIGN_MASK() is as wide as @x.  __typeof__ is
 * used instead of typeof because the latter is not a keyword in strict ISO
 * modes (e.g. -std=c11), while __typeof__ is accepted in every mode.
 *
 * Both macros evaluate @x more than once: do not pass expressions with side
 * effects.
 */
#define __PERF_ALIGN_MASK(x, mask)      (((x)+(mask))&~(mask))
#define PERF_ALIGN(x, a)        __PERF_ALIGN_MASK(x, (__typeof__(x))(a)-1)
17 
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.  Return: true if @n is a power of 2, otherwise
 * false.
 *
 * @n is fully parenthesized so that expressions built from low precedence
 * operators (|, ^, ?:, ...) are tested as a whole.  It is evaluated more than
 * once, so it must be free of side effects; the expansion remains an integer
 * constant expression when @n is one, which is what the _Static_assert()
 * users in this file need.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
27 
/* Number of slots in the __augmented_syscalls__ perf event array below */
#define MAX_CPUS  4096

/*
 * bpf-output associated map
 *
 * This is the map augmented__output() and augmented__beauty_output() hand to
 * bpf_perf_event_output(), always with BPF_F_CURRENT_CPU as the index.
 */
struct __augmented_syscalls__ {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__type(key, int);
	__type(value, __u32);
	__uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
37 
/*
 * What to augment at entry?
 *
 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
 *
 * Program array that sys_enter() tail calls into, using the syscall number as
 * the index.  Nothing in this file populates it.
 */
struct syscalls_sys_enter {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_enter SEC(".maps");
49 
/*
 * What to augment at exit?
 *
 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
 *
 * Program array that sys_exit() tail calls into, using the syscall number as
 * the index.  Nothing in this file populates it.
 */
struct syscalls_sys_exit {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_exit SEC(".maps");
61 
/*
 * Context received by the sys_enter programs in this file, and the first part
 * of every record they emit: 8 bytes of common tracepoint fields, the syscall
 * number and the six raw syscall arguments.
 *
 * NOTE(review): presumably mirrors the raw_syscalls:sys_enter tracepoint
 * format -- confirm against the tracepoint's format file.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	unsigned long	   args[6];
};
67 
/*
 * Context received by sys_exit(): 8 bytes of common tracepoint fields, the
 * syscall number and the syscall return value.
 *
 * NOTE(review): presumably mirrors the raw_syscalls:sys_exit tracepoint
 * format -- confirm against the tracepoint's format file.
 */
struct syscall_exit_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	long		   ret;
};
73 
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE   128     /* Implementation specific max size */

typedef unsigned short sa_family_t;

/*
 * FIXME: Should come from system headers
 *
 * The definition uses anonymous union and struct in order to control the
 * default alignment.
 *
 * sys_enter_connect() asserts at build time that sizeof() of this struct is a
 * power of two, as it and sys_enter_sendto() bound the user supplied socket
 * address length by masking it with sizeof() - 1.
 */
struct sockaddr_storage {
	union {
		struct {
			sa_family_t    ss_family; /* address family */
			/* Following field(s) are implementation specific */
			char __data[SS_MAXSIZE - sizeof(unsigned short)];
				/* space to achieve desired size, */
				/* SS_MAXSIZE value minus size of ss_family */
		};
		void *__align; /* implementation specific desired alignment */
	};
};
99 
/*
 * One augmented argument: a size + err header (8 bytes) followed by the bytes
 * copied from user space.
 *
 * @size:  number of payload bytes, as set by whoever filled this in
 * @err:   augmented_arg__read_str() stores here the non-positive result of a
 *         failed string read; other writers in this file set it to zero or
 *         leave it alone
 * @value: generic payload area; augmented_arg__read_str() asserts at build
 *         time that its size is a power of two
 * @saddr: view of the payload area used for socket addresses
 */
struct augmented_arg {
	unsigned int	size;
	int		err;
	union {
		char   value[PATH_MAX];
		struct sockaddr_storage saddr;
	};
};
108 
/*
 * Tasks that must not be traced: sys_enter() and sys_exit() return 0 right
 * away when the current task's id is a key in this map.  Only the presence of
 * the key is tested (see pid_filter__has()), the bool value is never read by
 * this file.
 */
struct pids_filtered {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t);
	__type(value, bool);
	__uint(max_entries, 64);
} pids_filtered SEC(".maps");
115 
/*
 * Record built by the per-syscall sys_enter_* programs: the raw sys_enter
 * payload immediately followed by the augmented argument(s).
 */
struct augmented_args_payload {
	struct syscall_enter_args args;
	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
};
120 
// We need more tmp space than the BPF stack can give us
// Single entry per-CPU array holding the 'struct augmented_args_payload'
// scratch area returned by augmented_args_payload().
struct augmented_args_tmp {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct augmented_args_payload);
	__uint(max_entries, 1);
} augmented_args_tmp SEC(".maps");
128 
/*
 * Per syscall description of what augment_sys_enter() should copy: looked up
 * by syscall number, each entry has one code per syscall argument.  The
 * meaning of the codes is documented inside augment_sys_enter().  Nothing in
 * this file populates it.
 */
struct beauty_map_enter {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, __u32[6]);
	__uint(max_entries, 512);
} beauty_map_enter SEC(".maps");
135 
/*
 * Record built by augment_sys_enter(): the raw sys_enter payload followed by
 * room for one full 'struct augmented_arg' per syscall argument.
 * augment_sys_enter() packs the records it writes back to back, so they do
 * not necessarily start at the aug_args[] element boundaries.
 */
struct beauty_payload_enter {
	struct syscall_enter_args args;
	struct augmented_arg aug_args[6];
};
140 
/*
 * Single entry per-CPU array holding the 'struct beauty_payload_enter' scratch
 * area used by augment_sys_enter().
 */
struct beauty_payload_enter_map {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct beauty_payload_enter);
	__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
147 
148 static inline struct augmented_args_payload *augmented_args_payload(void)
149 {
150 	int key = 0;
151 	return bpf_map_lookup_elem(&augmented_args_tmp, &key);
152 }
153 
154 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
155 {
156 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
157 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
158 }
159 
160 static inline int augmented__beauty_output(void *ctx, void *data, int len)
161 {
162 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
163 }
164 
165 static inline
166 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
167 {
168 	unsigned int augmented_len = sizeof(*augmented_arg);
169 	int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
170 
171 	augmented_arg->size = augmented_arg->err = 0;
172 	/*
173 	 * probe_read_str may return < 0, e.g. -EFAULT
174 	 * So we leave that in the augmented_arg->size that userspace will
175 	 */
176 	if (string_len > 0) {
177 		augmented_len -= sizeof(augmented_arg->value) - string_len;
178 		_Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
179 		augmented_len &= sizeof(augmented_arg->value) - 1;
180 		augmented_arg->size = string_len;
181 	} else {
182 		/*
183 		 * So that username notice the error while still being able
184 		 * to skip this augmented arg record
185 		 */
186 		augmented_arg->err = string_len;
187 		augmented_len = offsetof(struct augmented_arg, value);
188 	}
189 
190 	return augmented_len;
191 }
192 
/*
 * Program that augments nothing: it ignores its context and returns 1, the
 * value the other programs in this file return for "don't filter".
 *
 * NOTE(review): presumably installed by userspace as the default entry of
 * the syscalls_sys_enter/syscalls_sys_exit program arrays -- confirm there.
 */
SEC("tp/raw_syscalls/sys_enter")
int syscall_unaugmented(struct syscall_enter_args *args)
{
	return 1;
}
198 
199 /*
200  * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in
201  * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go
202  * on from there, reading the first syscall arg as a string, i.e. open's
203  * filename.
204  */
205 SEC("tp/syscalls/sys_enter_connect")
206 int sys_enter_connect(struct syscall_enter_args *args)
207 {
208 	struct augmented_args_payload *augmented_args = augmented_args_payload();
209 	const void *sockaddr_arg = (const void *)args->args[1];
210 	unsigned int socklen = args->args[2];
211 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
212 
213         if (augmented_args == NULL)
214                 return 1; /* Failure: don't filter */
215 
216 	_Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
217 	socklen &= sizeof(augmented_args->arg.saddr) - 1;
218 
219 	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
220 	augmented_args->arg.size = socklen;
221 	augmented_args->arg.err = 0;
222 
223 	return augmented__output(args, augmented_args, len + socklen);
224 }
225 
226 SEC("tp/syscalls/sys_enter_sendto")
227 int sys_enter_sendto(struct syscall_enter_args *args)
228 {
229 	struct augmented_args_payload *augmented_args = augmented_args_payload();
230 	const void *sockaddr_arg = (const void *)args->args[4];
231 	unsigned int socklen = args->args[5];
232 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
233 
234         if (augmented_args == NULL)
235                 return 1; /* Failure: don't filter */
236 
237 	socklen &= sizeof(augmented_args->arg.saddr) - 1;
238 
239 	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
240 
241 	return augmented__output(args, augmented_args, len + socklen);
242 }
243 
244 SEC("tp/syscalls/sys_enter_open")
245 int sys_enter_open(struct syscall_enter_args *args)
246 {
247 	struct augmented_args_payload *augmented_args = augmented_args_payload();
248 	const void *filename_arg = (const void *)args->args[0];
249 	unsigned int len = sizeof(augmented_args->args);
250 
251         if (augmented_args == NULL)
252                 return 1; /* Failure: don't filter */
253 
254 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
255 
256 	return augmented__output(args, augmented_args, len);
257 }
258 
259 SEC("tp/syscalls/sys_enter_openat")
260 int sys_enter_openat(struct syscall_enter_args *args)
261 {
262 	struct augmented_args_payload *augmented_args = augmented_args_payload();
263 	const void *filename_arg = (const void *)args->args[1];
264 	unsigned int len = sizeof(augmented_args->args);
265 
266         if (augmented_args == NULL)
267                 return 1; /* Failure: don't filter */
268 
269 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
270 
271 	return augmented__output(args, augmented_args, len);
272 }
273 
274 SEC("tp/syscalls/sys_enter_rename")
275 int sys_enter_rename(struct syscall_enter_args *args)
276 {
277 	struct augmented_args_payload *augmented_args = augmented_args_payload();
278 	const void *oldpath_arg = (const void *)args->args[0],
279 		   *newpath_arg = (const void *)args->args[1];
280 	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
281 
282         if (augmented_args == NULL)
283                 return 1; /* Failure: don't filter */
284 
285 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
286 
287 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
288 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
289 	len += augmented_args->arg.size;
290 
291 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
292 
293 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
294 	arg2->size = newpath_len;
295 
296 	len += newpath_len;
297 
298 	return augmented__output(args, augmented_args, len);
299 }
300 
301 SEC("tp/syscalls/sys_enter_renameat2")
302 int sys_enter_renameat2(struct syscall_enter_args *args)
303 {
304 	struct augmented_args_payload *augmented_args = augmented_args_payload();
305 	const void *oldpath_arg = (const void *)args->args[1],
306 		   *newpath_arg = (const void *)args->args[3];
307 	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
308 
309         if (augmented_args == NULL)
310                 return 1; /* Failure: don't filter */
311 
312 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
313 
314 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
315 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
316 	len += augmented_args->arg.size;
317 
318 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
319 
320 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
321 	arg2->size = newpath_len;
322 
323 	len += newpath_len;
324 
325 	return augmented__output(args, augmented_args, len);
326 }
327 
/* Size assumed by sys_enter_perf_event_open() when the attr's size field is zero */
#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */

// we need just the start, get the size to then copy it
// i.e. this is only the first 8 bytes of the attr that perf_event_open()
// receives: sys_enter_perf_event_open() reads these to learn 'size' and then
// copies that many bytes.
struct perf_event_attr_size {
        __u32                   type;
        /*
         * Size of the attr structure, for fwd/bwd compat.
         */
        __u32                   size;
};
338 
339 SEC("tp/syscalls/sys_enter_perf_event_open")
340 int sys_enter_perf_event_open(struct syscall_enter_args *args)
341 {
342 	struct augmented_args_payload *augmented_args = augmented_args_payload();
343 	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
344 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
345 
346         if (augmented_args == NULL)
347 		goto failure;
348 
349 	if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
350 		goto failure;
351 
352 	attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;
353 
354 	__u32 size = attr_read->size;
355 
356 	if (!size)
357 		size = PERF_ATTR_SIZE_VER0;
358 
359 	if (size > sizeof(augmented_args->arg.value))
360                 goto failure;
361 
362 	// Now that we read attr->size and tested it against the size limits, read it completely
363 	if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
364 		goto failure;
365 
366 	return augmented__output(args, augmented_args, len + size);
367 failure:
368 	return 1; /* Failure: don't filter */
369 }
370 
371 SEC("tp/syscalls/sys_enter_clock_nanosleep")
372 int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
373 {
374 	struct augmented_args_payload *augmented_args = augmented_args_payload();
375 	const void *rqtp_arg = (const void *)args->args[2];
376 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
377 	__u32 size = sizeof(struct timespec64);
378 
379         if (augmented_args == NULL)
380 		goto failure;
381 
382 	if (size > sizeof(augmented_args->arg.value))
383                 goto failure;
384 
385 	bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);
386 
387 	return augmented__output(args, augmented_args, len + size);
388 failure:
389 	return 1; /* Failure: don't filter */
390 }
391 
392 SEC("tp/syscalls/sys_enter_nanosleep")
393 int sys_enter_nanosleep(struct syscall_enter_args *args)
394 {
395 	struct augmented_args_payload *augmented_args = augmented_args_payload();
396 	const void *req_arg = (const void *)args->args[0];
397 	unsigned int len = sizeof(augmented_args->args);
398 	__u32 size = sizeof(struct timespec64);
399 
400         if (augmented_args == NULL)
401 		goto failure;
402 
403 	if (size > sizeof(augmented_args->arg.value))
404                 goto failure;
405 
406 	bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
407 
408 	return augmented__output(args, augmented_args, len + size);
409 failure:
410 	return 1; /* Failure: don't filter */
411 }
412 
413 static pid_t getpid(void)
414 {
415 	return bpf_get_current_pid_tgid();
416 }
417 
418 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
419 {
420 	return bpf_map_lookup_elem(pids, &pid) != NULL;
421 }
422 
/*
 * Generic, table driven augmenter: for the syscall in @args, copy from user
 * space what the beauty_map_enter entry for that syscall asks for and emit
 * the raw payload followed by the collected records.
 *
 * @ctx:  context handed to bpf_perf_event_output()
 * @args: raw sys_enter payload, with the syscall number and its arguments
 *
 * Returns 1 when nothing was emitted (@args is NULL, the syscall has no
 * beauty_map_enter entry, the scratch payload can't be obtained or no
 * argument ended up augmented), otherwise the result of
 * augmented__beauty_output().  sys_enter() falls back to the per-syscall
 * tail call whenever the result is non-zero.
 *
 * NOTE(review): only ->size is written in each record's header, ->err keeps
 * whatever the per-CPU scratch buffer held before.
 */
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
{
	bool augmented, do_output = false;
	/* value_size: bytes available in augmented_arg::value */
	int zero = 0, size, aug_size, index, output = 0,
	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
	unsigned int nr, *beauty_map;
	struct beauty_payload_enter *payload;
	void *arg, *payload_offset;

	/* fall back to do predefined tail call */
	if (args == NULL)
		return 1;

	/* use syscall number to get beauty_map entry */
	nr             = (__u32)args->syscall_nr;
	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);

	/* set up payload for output */
	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
	/* only an address is computed here, payload is checked below before any access */
	payload_offset = (void *)&payload->aug_args;

	if (beauty_map == NULL || payload == NULL)
		return 1;

	/* copy the sys_enter header, which has the syscall_nr */
	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));

	/*
	 * Determine what type of argument and how many bytes to read from user space, using the
	 * value in the beauty_map. This is the relation of parameter type and its corresponding
	 * value in the beauty map, and how many bytes we read eventually:
	 *
	 * string: 1			      -> size of string
	 * struct: size of struct	      -> size of struct
	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
	 */
	for (int i = 0; i < 6; i++) {
		arg = (void *)args->args[i];
		augmented = false;
		size = beauty_map[i];
		aug_size = size; /* size of the augmented data read from user space */

		/* nothing requested for this argument, or a NULL pointer was passed */
		if (size == 0 || arg == NULL)
			continue;

		if (size == 1) { /* string */
			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
			/* minimum of 0 to pass the verifier */
			if (aug_size < 0)
				aug_size = 0;

			/* a failed read still produces a record, with size 0 */
			augmented = true;
		} else if (size > 0 && size <= value_size) { /* struct */
			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
				augmented = true;
		} else if (size < 0 && size >= -6) { /* buffer */
			/* -1..-6 select args[0]..args[5] as the buffer length */
			index = -(size + 1);
			aug_size = args->args[index];

			if (aug_size > TRACE_AUG_MAX_BUF)
				aug_size = TRACE_AUG_MAX_BUF;

			if (aug_size > 0) {
				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
					augmented = true;
			}
		}

		/* write data to payload */
		if (augmented) {
			/* header + data: the next record starts right after this one */
			int written = offsetof(struct augmented_arg, value) + aug_size;

			((struct augmented_arg *)payload_offset)->size = aug_size;
			output += written;
			payload_offset += written;
			do_output = true;
		}
	}

	if (!do_output)
		return 1;

	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
}
507 
508 SEC("tp/raw_syscalls/sys_enter")
509 int sys_enter(struct syscall_enter_args *args)
510 {
511 	struct augmented_args_payload *augmented_args;
512 	/*
513 	 * We start len, the amount of data that will be in the perf ring
514 	 * buffer, if this is not filtered out by one of pid_filter__has(),
515 	 * syscall->enabled, etc, with the non-augmented raw syscall payload,
516 	 * i.e. sizeof(augmented_args->args).
517 	 *
518 	 * We'll add to this as we add augmented syscalls right after that
519 	 * initial, non-augmented raw_syscalls:sys_enter payload.
520 	 */
521 
522 	if (pid_filter__has(&pids_filtered, getpid()))
523 		return 0;
524 
525 	augmented_args = augmented_args_payload();
526 	if (augmented_args == NULL)
527 		return 1;
528 
529 	bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
530 
531 	/*
532 	 * Jump to syscall specific augmenter, even if the default one,
533 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
534 	 * unaugmented tracepoint payload.
535 	 */
536 	if (augment_sys_enter(args, &augmented_args->args))
537 		bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
538 
539 	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
540 	return 0;
541 }
542 
543 SEC("tp/raw_syscalls/sys_exit")
544 int sys_exit(struct syscall_exit_args *args)
545 {
546 	struct syscall_exit_args exit_args;
547 
548 	if (pid_filter__has(&pids_filtered, getpid()))
549 		return 0;
550 
551 	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
552 	/*
553 	 * Jump to syscall specific return augmenter, even if the default one,
554 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
555 	 * unaugmented tracepoint payload.
556 	 */
557 	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
558 	/*
559 	 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
560 	 */
561 	return 0;
562 }
563 
/* License of the programs in this object, placed in the "license" section */
char _license[] SEC("license") = "GPL";
565