xref: /linux/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c (revision 89713ce5518eda6b370c7a17edbcab4f97a39f68)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4  *
 * This exactly matches what is marshalled into the raw_syscalls:sys_enter
 * payload expected by the 'perf trace' beautifiers.
7  */
8 
9 #include "vmlinux.h"
10 #include "../trace_augment.h"
11 
12 #include <bpf/bpf_helpers.h>
13 #include <linux/limits.h>
14 
/*
 * PERF_ALIGN() - round @x up to the next multiple of @a.
 *
 * @a has to be a power of two: the rounding adds (@a - 1) and then clears
 * the low bits. The mask is cast to the type of @x so that the arithmetic
 * is carried out in that type.
 *
 * __typeof__ is spelled with underscores because, unlike typeof, it is
 * accepted in strict ISO language modes as well.
 */
#define PERF_ALIGN(x, a)        __PERF_ALIGN_MASK(x, (__typeof__(x))(a)-1)
#define __PERF_ALIGN_MASK(x, mask)      (((x)+(mask))&~(mask))
17 
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.  Return: true if @n is a power of 2, otherwise
 * false.
 *
 * @n is fully parenthesized in the expansion, so compound expressions such
 * as 'a & b' or 'c ? x : y' can be passed. It is evaluated more than once,
 * so it must not have side effects. The expansion stays a constant
 * expression for constant @n, which lets it be used in _Static_assert().
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
27 
/* Number of entries in the perf event array below */
#define MAX_CPUS  4096

/*
 * bpf-output associated map
 *
 * Every bpf_perf_event_output() call in this file (augmented__output() and
 * augmented__beauty_output()) targets this map, with BPF_F_CURRENT_CPU as
 * the flags argument.
 */
struct __augmented_syscalls__ {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__type(key, int);
	__type(value, __u32);
	__uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
37 
/*
 * What to augment at entry?
 *
 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
 *
 * Program array that sys_enter() tail calls into, using the syscall number
 * as the index.
 */
struct syscalls_sys_enter {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_enter SEC(".maps");
49 
/*
 * What to augment at exit?
 *
 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
 *
 * Program array that sys_exit() tail calls into, using the syscall number
 * as the index.
 */
struct syscalls_sys_exit {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_exit SEC(".maps");
61 
/*
 * Context layout used by the sys_enter programs in this file. It is also the
 * first member of every payload sent to userspace, so the field order and
 * sizes are part of the record format.
 */
struct syscall_enter_args {
	/* NOTE(review): presumably the common tracepoint header - confirm against the tracepoint format */
	unsigned long long common_tp_fields;
	/* syscall number: key into beauty_map_enter and index into syscalls_sys_enter */
	long		   syscall_nr;
	/* the six raw syscall arguments; pointer arguments are user addresses */
	unsigned long	   args[6];
};
67 
/* Context layout used by sys_exit(), which copies it to read syscall_nr. */
struct syscall_exit_args {
	/* NOTE(review): presumably the common tracepoint header - confirm against the tracepoint format */
	unsigned long long common_tp_fields;
	/* syscall number: index into syscalls_sys_exit */
	long		   syscall_nr;
	/* not read anywhere in this file */
	long		   ret;
};
73 
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE   128     /* Implementation specific max size */

/* Type of the address family field that starts struct sockaddr_storage */
typedef unsigned short sa_family_t;

/*
 * FIXME: Should come from system headers
 *
 * The definition uses anonymous union and struct in order to control the
 * default alignment.
 *
 * sys_enter_connect() and sys_enter_sendto() bound the user supplied length
 * by masking it with sizeof() of this struct minus one, so the size has to
 * stay a power of two.
 */
struct sockaddr_storage {
	union {
		struct {
			sa_family_t    ss_family; /* address family */
			/* Following field(s) are implementation specific */
			char __data[SS_MAXSIZE - sizeof(unsigned short)];
				/* space to achieve desired size, */
				/* SS_MAXSIZE value minus size of ss_family */
		};
		void *__align; /* implementation specific desired alignment */
	};
};
99 
/*
 * One augmented syscall argument as sent to userspace: a { size, err }
 * header followed by the bytes copied from user memory.
 */
struct augmented_arg {
	/* number of payload bytes, as recorded by whoever filled the record in */
	unsigned int	size;
	/* augmented_arg__read_str() stores the non-positive result of a failed string read here */
	int		err;
	union {
		/* strings and generic struct/buffer data */
		char   value[PATH_MAX];
		/* socket address, filled in by the connect and sendto augmenters */
		struct sockaddr_storage saddr;
	};
};
108 
/*
 * Set of pids to ignore: sys_enter() and sys_exit() return 0 right away when
 * the current pid is a key in this map. Only the presence of the key is
 * tested (see pid_filter__has()), the bool value is not read by this file.
 */
struct pids_filtered {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t);
	__type(value, bool);
	__uint(max_entries, 64);
} pids_filtered SEC(".maps");
115 
116 struct augmented_args_payload {
117 	struct syscall_enter_args args;
118 	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
119 };
120 
// We need more tmp space than the BPF stack can give us
// Single entry per-CPU array; its only element (key 0) is fetched by augmented_args_payload().
struct augmented_args_tmp {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct augmented_args_payload);
	__uint(max_entries, 1);
} augmented_args_tmp SEC(".maps");
128 
/*
 * Per syscall description of what augment_sys_enter() should collect. The key
 * is the syscall number and the value has one entry per syscall argument;
 * augment_sys_enter() interprets each entry as: 0 for nothing, 1 for a string,
 * a larger positive number for a struct of that size, and -1 to -6 for a
 * buffer whose length is held by another argument.
 */
struct beauty_map_enter {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, __u32[6]);
	__uint(max_entries, 512);
} beauty_map_enter SEC(".maps");
135 
/*
 * Record built by augment_sys_enter(): the raw sys_enter payload followed by
 * room for one augmented_arg per syscall argument. augment_sys_enter() packs
 * the records it fills in back to back, so they do not necessarily start at
 * the aug_args[] element boundaries.
 */
struct beauty_payload_enter {
	struct syscall_enter_args args;
	struct augmented_arg aug_args[6];
};
140 
/*
 * Single entry per-CPU array holding the beauty_payload_enter that
 * augment_sys_enter() fills in and outputs (looked up with key 0).
 */
struct beauty_payload_enter_map {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct beauty_payload_enter);
	__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
147 
148 static inline struct augmented_args_payload *augmented_args_payload(void)
149 {
150 	int key = 0;
151 	return bpf_map_lookup_elem(&augmented_args_tmp, &key);
152 }
153 
154 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
155 {
156 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
157 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
158 }
159 
160 static inline int augmented__beauty_output(void *ctx, void *data, int len)
161 {
162 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
163 }
164 
165 static inline
166 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
167 {
168 	unsigned int augmented_len = sizeof(*augmented_arg);
169 	int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
170 
171 	augmented_arg->size = augmented_arg->err = 0;
172 	/*
173 	 * probe_read_str may return < 0, e.g. -EFAULT
174 	 * So we leave that in the augmented_arg->size that userspace will
175 	 */
176 	if (string_len > 0) {
177 		augmented_len -= sizeof(augmented_arg->value) - string_len;
178 		_Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
179 		augmented_len &= sizeof(augmented_arg->value) - 1;
180 		augmented_arg->size = string_len;
181 	} else {
182 		/*
183 		 * So that username notice the error while still being able
184 		 * to skip this augmented arg record
185 		 */
186 		augmented_arg->err = string_len;
187 		augmented_len = offsetof(struct augmented_arg, value);
188 	}
189 
190 	return augmented_len;
191 }
192 
193 SEC("tp/raw_syscalls/sys_enter")
194 int syscall_unaugmented(struct syscall_enter_args *args)
195 {
196 	return 1;
197 }
198 
/*
 * The programs below will be tail_called from SEC("tp/raw_syscalls/sys_enter"),
 * so they will find in augmented_args_tmp the raw payload that was copied there
 * by sys_enter() and go on from there, appending the contents of their pointer
 * argument(s): the sockaddr for connect, the filename for open, etc.
 */
205 SEC("tp/syscalls/sys_enter_connect")
206 int sys_enter_connect(struct syscall_enter_args *args)
207 {
208 	struct augmented_args_payload *augmented_args = augmented_args_payload();
209 	const void *sockaddr_arg = (const void *)args->args[1];
210 	unsigned int socklen = args->args[2];
211 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
212 
213         if (augmented_args == NULL)
214                 return 1; /* Failure: don't filter */
215 
216 	_Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
217 	socklen &= sizeof(augmented_args->arg.saddr) - 1;
218 
219 	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
220 	augmented_args->arg.size = socklen;
221 	augmented_args->arg.err = 0;
222 
223 	return augmented__output(args, augmented_args, len + socklen);
224 }
225 
226 SEC("tp/syscalls/sys_enter_sendto")
227 int sys_enter_sendto(struct syscall_enter_args *args)
228 {
229 	struct augmented_args_payload *augmented_args = augmented_args_payload();
230 	const void *sockaddr_arg = (const void *)args->args[4];
231 	unsigned int socklen = args->args[5];
232 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
233 
234         if (augmented_args == NULL)
235                 return 1; /* Failure: don't filter */
236 
237 	socklen &= sizeof(augmented_args->arg.saddr) - 1;
238 
239 	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
240 
241 	return augmented__output(args, augmented_args, len + socklen);
242 }
243 
244 SEC("tp/syscalls/sys_enter_open")
245 int sys_enter_open(struct syscall_enter_args *args)
246 {
247 	struct augmented_args_payload *augmented_args = augmented_args_payload();
248 	const void *filename_arg = (const void *)args->args[0];
249 	unsigned int len = sizeof(augmented_args->args);
250 
251         if (augmented_args == NULL)
252                 return 1; /* Failure: don't filter */
253 
254 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
255 
256 	return augmented__output(args, augmented_args, len);
257 }
258 
259 SEC("tp/syscalls/sys_enter_openat")
260 int sys_enter_openat(struct syscall_enter_args *args)
261 {
262 	struct augmented_args_payload *augmented_args = augmented_args_payload();
263 	const void *filename_arg = (const void *)args->args[1];
264 	unsigned int len = sizeof(augmented_args->args);
265 
266         if (augmented_args == NULL)
267                 return 1; /* Failure: don't filter */
268 
269 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
270 
271 	return augmented__output(args, augmented_args, len);
272 }
273 
274 SEC("tp/syscalls/sys_enter_rename")
275 int sys_enter_rename(struct syscall_enter_args *args)
276 {
277 	struct augmented_args_payload *augmented_args = augmented_args_payload();
278 	const void *oldpath_arg = (const void *)args->args[0],
279 		   *newpath_arg = (const void *)args->args[1];
280 	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
281 
282         if (augmented_args == NULL)
283                 return 1; /* Failure: don't filter */
284 
285 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
286 
287 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
288 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
289 	len += augmented_args->arg.size;
290 
291 	/* Every read from userspace is limited to value size */
292 	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
293 		return 1; /* Failure: don't filter */
294 
295 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
296 
297 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
298 	arg2->size = newpath_len;
299 
300 	len += newpath_len;
301 
302 	return augmented__output(args, augmented_args, len);
303 }
304 
305 SEC("tp/syscalls/sys_enter_renameat2")
306 int sys_enter_renameat2(struct syscall_enter_args *args)
307 {
308 	struct augmented_args_payload *augmented_args = augmented_args_payload();
309 	const void *oldpath_arg = (const void *)args->args[1],
310 		   *newpath_arg = (const void *)args->args[3];
311 	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
312 
313         if (augmented_args == NULL)
314                 return 1; /* Failure: don't filter */
315 
316 	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
317 
318 	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
319 	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
320 	len += augmented_args->arg.size;
321 
322 	/* Every read from userspace is limited to value size */
323 	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
324 		return 1; /* Failure: don't filter */
325 
326 	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
327 
328 	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
329 	arg2->size = newpath_len;
330 
331 	len += newpath_len;
332 
333 	return augmented__output(args, augmented_args, len);
334 }
335 
/* Size assumed by sys_enter_perf_event_open() when userspace passes attr->size == 0 */
#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */

// we need just the start, get the size to then copy it
// sys_enter_perf_event_open() reads this much first, then re-reads 'size' bytes from the same address
struct perf_event_attr_size {
        __u32                   type;
        /*
         * Size of the attr structure, for fwd/bwd compat.
         */
        __u32                   size;
};
346 
347 SEC("tp/syscalls/sys_enter_perf_event_open")
348 int sys_enter_perf_event_open(struct syscall_enter_args *args)
349 {
350 	struct augmented_args_payload *augmented_args = augmented_args_payload();
351 	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
352 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
353 
354         if (augmented_args == NULL)
355 		goto failure;
356 
357 	if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
358 		goto failure;
359 
360 	attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;
361 
362 	__u32 size = attr_read->size;
363 
364 	if (!size)
365 		size = PERF_ATTR_SIZE_VER0;
366 
367 	if (size > sizeof(augmented_args->arg.value))
368                 goto failure;
369 
370 	// Now that we read attr->size and tested it against the size limits, read it completely
371 	if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
372 		goto failure;
373 
374 	return augmented__output(args, augmented_args, len + size);
375 failure:
376 	return 1; /* Failure: don't filter */
377 }
378 
379 SEC("tp/syscalls/sys_enter_clock_nanosleep")
380 int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
381 {
382 	struct augmented_args_payload *augmented_args = augmented_args_payload();
383 	const void *rqtp_arg = (const void *)args->args[2];
384 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
385 	__u32 size = sizeof(struct timespec64);
386 
387         if (augmented_args == NULL)
388 		goto failure;
389 
390 	if (size > sizeof(augmented_args->arg.value))
391                 goto failure;
392 
393 	bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);
394 
395 	return augmented__output(args, augmented_args, len + size);
396 failure:
397 	return 1; /* Failure: don't filter */
398 }
399 
400 SEC("tp/syscalls/sys_enter_nanosleep")
401 int sys_enter_nanosleep(struct syscall_enter_args *args)
402 {
403 	struct augmented_args_payload *augmented_args = augmented_args_payload();
404 	const void *req_arg = (const void *)args->args[0];
405 	unsigned int len = sizeof(augmented_args->args);
406 	__u32 size = sizeof(struct timespec64);
407 
408         if (augmented_args == NULL)
409 		goto failure;
410 
411 	if (size > sizeof(augmented_args->arg.value))
412                 goto failure;
413 
414 	bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
415 
416 	return augmented__output(args, augmented_args, len + size);
417 failure:
418 	return 1; /* Failure: don't filter */
419 }
420 
421 static pid_t getpid(void)
422 {
423 	return bpf_get_current_pid_tgid();
424 }
425 
426 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
427 {
428 	return bpf_map_lookup_elem(pids, &pid) != NULL;
429 }
430 
/*
 * Generic sys_enter augmenter, driven by the syscall's entry in beauty_map_enter.
 *
 * @ctx:  handed to bpf_perf_event_output() as the program context
 * @args: the sys_enter payload: syscall number plus the six raw arguments
 *
 * For each of the six arguments the beauty_map entry tells whether, and how, the user
 * memory it points to should be collected. What gets collected is packed back to back,
 * as augmented_arg records cut short to the bytes in use, after a copy of @args in the
 * beauty_payload_enter_map entry, which is then sent via augmented__beauty_output().
 *
 * Return: 1 when nothing was sent (no beauty_map entry, no payload buffer, nothing
 * collected or a size check failed), which sys_enter() takes as the cue to fall back to
 * the predefined tail call; otherwise the result of augmented__beauty_output().
 */
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
{
	bool augmented, do_output = false;
	/* value_size: room for data in one augmented_arg, i.e. what comes after size and err */
	int zero = 0, size, aug_size, index,
	    value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
	unsigned int nr, *beauty_map;
	struct beauty_payload_enter *payload;
	void *arg, *payload_offset;

	/* fall back to do predefined tail call */
	if (args == NULL)
		return 1;

	/* use syscall number to get beauty_map entry */
	nr             = (__u32)args->syscall_nr;
	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);

	/* set up payload for output */
	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
	/* only an address is computed here, payload is checked for NULL before any access */
	payload_offset = (void *)&payload->aug_args;

	if (beauty_map == NULL || payload == NULL)
		return 1;

	/* copy the sys_enter header, which has the syscall_nr */
	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));

	/*
	 * Determine what type of argument and how many bytes to read from user space, using the
	 * value in the beauty_map. This is the relation of parameter type and its corresponding
	 * value in the beauty map, and how many bytes we read eventually:
	 *
	 * string: 1			      -> size of string
	 * struct: size of struct	      -> size of struct
	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
	 */
	for (int i = 0; i < 6; i++) {
		arg = (void *)args->args[i];
		augmented = false;
		size = beauty_map[i];
		aug_size = size; /* size of the augmented data read from user space */

		/* nothing to collect for this argument, or a NULL pointer was passed */
		if (size == 0 || arg == NULL)
			continue;

		if (size == 1) { /* string */
			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
			/* minimum of 0 to pass the verifier */
			if (aug_size < 0)
				aug_size = 0;

			/* a record is produced even when the read failed, then with a size of 0 */
			augmented = true;
		} else if (size > 0 && size <= value_size) { /* struct */
			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
				augmented = true;
		} else if (size < 0 && size >= -6) { /* buffer */
			/* -1..-6 maps to 0..5: the argument that holds the buffer length */
			index = -(size + 1);
			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
			aug_size = args->args[index];

			if (aug_size > TRACE_AUG_MAX_BUF)
				aug_size = TRACE_AUG_MAX_BUF;

			/* also skips lengths that ended up negative once stored in an int */
			if (aug_size > 0) {
				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
					augmented = true;
			}
		}

		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
		if (aug_size > value_size)
			aug_size = value_size;

		/* write data to payload */
		if (augmented) {
			/* bytes taken by this record: the size + err header and the data */
			int written = offsetof(struct augmented_arg, value) + aug_size;

			if (written < 0 || written > sizeof(struct augmented_arg))
				return 1;

			/* NOTE(review): only ->size is set, ->err keeps whatever the buffer had - confirm that is intended */
			((struct augmented_arg *)payload_offset)->size = aug_size;
			output += written;
			/* the next record starts right after this one's data */
			payload_offset += written;
			do_output = true;
		}
	}

	/* nothing collected, or more than the payload can hold: let the caller fall back */
	if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
		return 1;

	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
}
525 
526 SEC("tp/raw_syscalls/sys_enter")
527 int sys_enter(struct syscall_enter_args *args)
528 {
529 	struct augmented_args_payload *augmented_args;
530 	/*
531 	 * We start len, the amount of data that will be in the perf ring
532 	 * buffer, if this is not filtered out by one of pid_filter__has(),
533 	 * syscall->enabled, etc, with the non-augmented raw syscall payload,
534 	 * i.e. sizeof(augmented_args->args).
535 	 *
536 	 * We'll add to this as we add augmented syscalls right after that
537 	 * initial, non-augmented raw_syscalls:sys_enter payload.
538 	 */
539 
540 	if (pid_filter__has(&pids_filtered, getpid()))
541 		return 0;
542 
543 	augmented_args = augmented_args_payload();
544 	if (augmented_args == NULL)
545 		return 1;
546 
547 	bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
548 
549 	/*
550 	 * Jump to syscall specific augmenter, even if the default one,
551 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
552 	 * unaugmented tracepoint payload.
553 	 */
554 	if (augment_sys_enter(args, &augmented_args->args))
555 		bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
556 
557 	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
558 	return 0;
559 }
560 
561 SEC("tp/raw_syscalls/sys_exit")
562 int sys_exit(struct syscall_exit_args *args)
563 {
564 	struct syscall_exit_args exit_args;
565 
566 	if (pid_filter__has(&pids_filtered, getpid()))
567 		return 0;
568 
569 	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
570 	/*
571 	 * Jump to syscall specific return augmenter, even if the default one,
572 	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
573 	 * unaugmented tracepoint payload.
574 	 */
575 	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
576 	/*
577 	 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
578 	 */
579 	return 0;
580 }
581 
/* License string, placed in the "license" section of the object */
char _license[] SEC("license") = "GPL";
583