xref: /linux/tools/perf/util/bpf_skel/augmented_raw_syscalls.bpf.c (revision 3f2a5ba784b808109cac0aac921213e43143a216)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4  *
5  * This exactly matches what is marshalled into the raw_syscall:sys_enter
6  * payload expected by the 'perf trace' beautifiers.
7  */
8 
9 #include "vmlinux.h"
10 
11 #include <bpf/bpf_helpers.h>
12 #include <linux/limits.h>
13 
/*
 * PERF_ALIGN(x, a): add (a - 1) to x and clear the bits selected by (a - 1).
 * The mask is cast to typeof(x) before the subtraction, so the arithmetic is
 * done in x's type. Both x and the mask are expanded more than once, so do
 * not pass expressions with side effects.
 * NOTE(review): this yields "x rounded up to a multiple of a" only when a is
 * a power of two; every use in this file passes sizeof(u64).
 */
#define PERF_ALIGN(x, a)        __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
#define __PERF_ALIGN_MASK(x, mask)      (((x)+(mask))&~(mask))
16 
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two.  Return: true if @n is a power of 2, otherwise
 * false.
 *
 * Kept as a macro (not an inline function) so that it can be used inside
 * _Static_assert(). @n is parenthesised at every use so that arguments
 * containing low-precedence operators (e.g. "a & b") are evaluated as a
 * unit; @n is expanded three times, so it must not have side effects.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
26 
/* Number of entries in the __augmented_syscalls__ perf event array. */
#define MAX_CPUS  4096

/*
 * Upper bound, in bytes, on what augment_sys_enter() copies for a "buffer"
 * argument (one whose length comes from another syscall argument).
 */
#define TRACE_AUG_MAX_BUF 32 /* for buffer augmentation in perf trace */
30 
/*
 * bpf-output associated map.
 *
 * Perf event array that augmented__output() and augmented__beauty_output()
 * write to with BPF_F_CURRENT_CPU; it has MAX_CPUS entries.
 */
struct __augmented_syscalls__ {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__type(key, int);
	__type(value, __u32);
	__uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
38 
/*
 * What to augment at entry?
 *
 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel.
 *
 * Program array that sys_enter() tail-calls into, indexed by syscall number
 * (up to 512 entries).
 * NOTE(review): nothing in this file fills the array; presumably the loader
 * installs one augmenter per syscall -- confirm against the userspace side.
 */
struct syscalls_sys_enter {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_enter SEC(".maps");
50 
/*
 * What to augment at exit?
 *
 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
 *
 * Program array that sys_exit() tail-calls into, indexed by syscall number
 * (up to 512 entries).
 */
struct syscalls_sys_exit {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_exit SEC(".maps");
62 
/*
 * View of the sys_enter tracepoint context used by the programs in this
 * file. It is also the first member of every payload sent to userspace.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;	/* leading 8 bytes, not interpreted in this file */
	long		   syscall_nr;		/* syscall number: map key and tail-call index */
	unsigned long	   args[6];		/* raw syscall arguments */
};
68 
/* View of the sys_exit tracepoint context; sys_exit() only reads syscall_nr. */
struct syscall_exit_args {
	unsigned long long common_tp_fields;	/* leading 8 bytes, not interpreted in this file */
	long		   syscall_nr;		/* syscall number: tail-call index */
	long		   ret;			/* not read anywhere in this file */
};
74 
/*
 * Desired design of maximum size and alignment (see RFC2553)
 *
 * SS_MAXSIZE sizes struct sockaddr_storage below; the socket address
 * augmenters rely on that size being a power of two.
 */
#define SS_MAXSIZE   128     /* Implementation specific max size */

/* Type of the address family field at the start of struct sockaddr_storage. */
typedef unsigned short sa_family_t;
81 
/*
 * FIXME: Should come from system headers
 *
 * The definition uses anonymous union and struct in order to control the
 * default alignment.
 *
 * The inner struct is SS_MAXSIZE bytes (family plus padding); the void *
 * union member only contributes pointer alignment.
 */
struct sockaddr_storage {
	union {
		struct {
			sa_family_t    ss_family; /* address family */
			/* Following field(s) are implementation specific */
			char __data[SS_MAXSIZE - sizeof(unsigned short)];
				/* space to achieve desired size, */
				/* SS_MAXSIZE value minus size of ss_family */
		};
		void *__align; /* implementation specific desired alignment */
	};
};
100 
/*
 * One augmented syscall argument as sent to userspace: a size/err header
 * followed by the bytes copied from the pointer argument.
 */
struct augmented_arg {
	unsigned int	size;	/* number of payload bytes, as set by the augmenter */
	int		err;	/* 0, or the negative/zero result of a failed string read */
	union {
		char   value[PATH_MAX];			/* generic bytes: strings, structs, buffers */
		struct sockaddr_storage saddr;		/* socket address view of the same bytes */
	};
};
109 
/*
 * Set of pids whose syscalls are dropped: sys_enter() and sys_exit() return 0
 * right away when the current pid is a key in this hash. Only the presence of
 * the key is tested (see pid_filter__has()); the bool value is not read here.
 */
struct pids_filtered {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t);
	__type(value, bool);
	__uint(max_entries, 64);
} pids_filtered SEC(".maps");
116 
/*
 * Record built by the per-syscall augmenters: the raw sys_enter payload
 * followed by the augmented argument(s).
 */
struct augmented_args_payload {
	struct syscall_enter_args args;	/* copy of the tracepoint payload, filled in by sys_enter() */
	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
};
121 
// We need more tmp space than the BPF stack can give us
// Single-entry per-CPU array holding one struct augmented_args_payload;
// accessed through augmented_args_payload() with key 0.
struct augmented_args_tmp {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct augmented_args_payload);
	__uint(max_entries, 1);
} augmented_args_tmp SEC(".maps");
129 
/*
 * Per-syscall description of how to augment each of the six arguments,
 * looked up by syscall number in augment_sys_enter(). The meaning of each of
 * the six __u32 codes is documented in that function.
 */
struct beauty_map_enter {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, __u32[6]);
	__uint(max_entries, 512);
} beauty_map_enter SEC(".maps");
136 
/*
 * Record built by augment_sys_enter(): the raw sys_enter payload followed by
 * room for up to six augmented arguments. The function packs the arguments
 * back to back, so aug_args[] is used as a byte area rather than indexed.
 */
struct beauty_payload_enter {
	struct syscall_enter_args args;
	struct augmented_arg aug_args[6];
};
141 
/*
 * Single-entry per-CPU array holding the struct beauty_payload_enter that
 * augment_sys_enter() fills in and outputs (looked up with key 0).
 */
struct beauty_payload_enter_map {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct beauty_payload_enter);
	__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
148 
149 static inline struct augmented_args_payload *augmented_args_payload(void)
150 {
151 	int key = 0;
152 	return bpf_map_lookup_elem(&augmented_args_tmp, &key);
153 }
154 
155 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
156 {
157 	/* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
158 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
159 }
160 
161 static inline int augmented__beauty_output(void *ctx, void *data, int len)
162 {
163 	return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
164 }
165 
/*
 * Copy a string from user space into @augmented_arg->value and fill in the
 * size/err header.
 *
 * @augmented_arg: record to fill in
 * @arg:           user space address of the string
 * @arg_len:       limit passed to bpf_probe_read_user_str()
 *
 * Return: when the helper returns > 0, sizeof(*augmented_arg) minus the
 * unused tail of ->value, masked with sizeof(->value) - 1; otherwise
 * offsetof(struct augmented_arg, value), i.e. only the header.
 */
static inline
unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
{
	unsigned int augmented_len = sizeof(*augmented_arg);
	int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);

	augmented_arg->size = augmented_arg->err = 0;
	/*
	 * bpf_probe_read_user_str() may return < 0, e.g. -EFAULT.
	 * In that case the value is stored in augmented_arg->err (else branch
	 * below) and augmented_arg->size stays 0.
	 */
	if (string_len > 0) {
		/* drop the part of ->value that the string did not use */
		augmented_len -= sizeof(augmented_arg->value) - string_len;
		_Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
		/*
		 * Bound the length with a mask; this is only a valid bound
		 * because the size is a power of two (asserted above).
		 * NOTE(review): presumably done for the BPF verifier -- confirm.
		 */
		augmented_len &= sizeof(augmented_arg->value) - 1;
		augmented_arg->size = string_len;
	} else {
		/*
		 * So that userspace notices the error while still being able
		 * to skip this augmented arg record
		 */
		augmented_arg->err = string_len;
		augmented_len = offsetof(struct augmented_arg, value);
	}

	return augmented_len;
}
193 
/*
 * Default augmenter: does nothing and returns 1, which the other programs in
 * this file document as "don't filter", i.e. the unaugmented tracepoint
 * payload is kept.
 */
SEC("tp/raw_syscalls/sys_enter")
int syscall_unaugmented(struct syscall_enter_args *args)
{
	return 1;
}
199 
200 /*
201  * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in
202  * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go
203  * on from there, reading the first syscall arg as a string, i.e. open's
204  * filename.
205  */
/*
 * connect: augment the socket address (args[1]) whose length is args[2].
 *
 * Output is the raw sys_enter payload, the size/err header and the masked
 * number of address bytes. Returns 1 if the scratch payload is unavailable,
 * otherwise the result of augmented__output().
 */
SEC("tp/syscalls/sys_enter_connect")
int sys_enter_connect(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const void *sockaddr_arg = (const void *)args->args[1];
	unsigned int socklen = args->args[2];
	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs

        if (augmented_args == NULL)
                return 1; /* Failure: don't filter */

	_Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
	/*
	 * Keep socklen below sizeof(saddr) with a mask instead of a compare.
	 * NOTE(review): a length equal to sizeof(saddr) is masked down to 0.
	 */
	socklen &= sizeof(augmented_args->arg.saddr) - 1;

	/* the result of the read is not checked; err is always reported as 0 */
	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
	augmented_args->arg.size = socklen;
	augmented_args->arg.err = 0;

	return augmented__output(args, augmented_args, len + socklen);
}
226 
227 SEC("tp/syscalls/sys_enter_sendto")
228 int sys_enter_sendto(struct syscall_enter_args *args)
229 {
230 	struct augmented_args_payload *augmented_args = augmented_args_payload();
231 	const void *sockaddr_arg = (const void *)args->args[4];
232 	unsigned int socklen = args->args[5];
233 	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
234 
235         if (augmented_args == NULL)
236                 return 1; /* Failure: don't filter */
237 
238 	socklen &= sizeof(augmented_args->arg.saddr) - 1;
239 
240 	bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
241 
242 	return augmented__output(args, augmented_args, len + socklen);
243 }
244 
245 SEC("tp/syscalls/sys_enter_open")
246 int sys_enter_open(struct syscall_enter_args *args)
247 {
248 	struct augmented_args_payload *augmented_args = augmented_args_payload();
249 	const void *filename_arg = (const void *)args->args[0];
250 	unsigned int len = sizeof(augmented_args->args);
251 
252         if (augmented_args == NULL)
253                 return 1; /* Failure: don't filter */
254 
255 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
256 
257 	return augmented__output(args, augmented_args, len);
258 }
259 
260 SEC("tp/syscalls/sys_enter_openat")
261 int sys_enter_openat(struct syscall_enter_args *args)
262 {
263 	struct augmented_args_payload *augmented_args = augmented_args_payload();
264 	const void *filename_arg = (const void *)args->args[1];
265 	unsigned int len = sizeof(augmented_args->args);
266 
267         if (augmented_args == NULL)
268                 return 1; /* Failure: don't filter */
269 
270 	len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
271 
272 	return augmented__output(args, augmented_args, len);
273 }
274 
/*
 * rename: augment both path strings, args[0] (old) and args[1] (new).
 *
 * The second record is not written to augmented_args->arg2 but placed in
 * memory right after the first string, at an 8-byte aligned offset.
 * Returns 1 if the scratch payload is unavailable or the first record is too
 * large, otherwise the result of augmented__output().
 */
SEC("tp/syscalls/sys_enter_rename")
int sys_enter_rename(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const void *oldpath_arg = (const void *)args->args[0],
		   *newpath_arg = (const void *)args->args[1];
	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;

        if (augmented_args == NULL)
                return 1; /* Failure: don't filter */

	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...

	/* oldpath_len is the record length reported by augmented_arg__read_str() */
	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
	/* overwrite the size set by the helper with the 8-byte aligned length */
	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
	len += augmented_args->arg.size;

	/* Every read from userspace is limited to value size */
	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
		return 1; /* Failure: don't filter */

	/* second record starts arg.size bytes into the first record's value area */
	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;

	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
	/* as above, the helper's size is replaced by its returned length */
	arg2->size = newpath_len;

	len += newpath_len;

	return augmented__output(args, augmented_args, len);
}
305 
/*
 * renameat2: augment both path strings, args[1] (old) and args[3] (new).
 *
 * Same layout as sys_enter_rename(): the second record is placed in memory
 * right after the first string, at an 8-byte aligned offset, rather than in
 * augmented_args->arg2. Returns 1 if the scratch payload is unavailable or
 * the first record is too large, otherwise the result of augmented__output().
 */
SEC("tp/syscalls/sys_enter_renameat2")
int sys_enter_renameat2(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const void *oldpath_arg = (const void *)args->args[1],
		   *newpath_arg = (const void *)args->args[3];
	unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;

        if (augmented_args == NULL)
                return 1; /* Failure: don't filter */

	len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...

	/* oldpath_len is the record length reported by augmented_arg__read_str() */
	oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
	/* overwrite the size set by the helper with the 8-byte aligned length */
	augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
	len += augmented_args->arg.size;

	/* Every read from userspace is limited to value size */
	if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
		return 1; /* Failure: don't filter */

	/* second record starts arg.size bytes into the first record's value area */
	struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;

	newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
	/* as above, the helper's size is replaced by its returned length */
	arg2->size = newpath_len;

	len += newpath_len;

	return augmented__output(args, augmented_args, len);
}
336 
/* Size assumed by sys_enter_perf_event_open() when the attr's size field is 0. */
#define PERF_ATTR_SIZE_VER0     64      /* sizeof first published struct */

// we need just the start, get the size to then copy it
// (only these two leading fields are read in the first pass)
struct perf_event_attr_size {
        __u32                   type;
        /*
         * Size of the attr structure, for fwd/bwd compat.
         */
        __u32                   size;
};
347 
/*
 * perf_event_open: augment the attr structure pointed to by args[0].
 *
 * The structure carries its own size, so it is read in two steps: first the
 * leading type/size fields, then the whole thing once the size has been
 * checked against the destination buffer. Returns 1 on any failure,
 * otherwise the result of augmented__output().
 */
SEC("tp/syscalls/sys_enter_perf_event_open")
int sys_enter_perf_event_open(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs

        if (augmented_args == NULL)
		goto failure;

	/* first pass: just the header, to learn the size */
	if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
		goto failure;

	attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;

	__u32 size = attr_read->size;

	/* a size of 0 is treated as the first published layout */
	if (!size)
		size = PERF_ATTR_SIZE_VER0;

	if (size > sizeof(augmented_args->arg.value))
                goto failure;

	// Now that we read attr->size and tested it against the size limits, read it completely
	if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
		goto failure;

	/* NOTE(review): arg.size and arg.err are not written on this path */
	return augmented__output(args, augmented_args, len + size);
failure:
	return 1; /* Failure: don't filter */
}
379 
/*
 * clock_nanosleep: augment the requested time, the struct timespec64 pointed
 * to by args[2].
 *
 * Output is the raw sys_enter payload, the size/err header and
 * sizeof(struct timespec64) bytes of value. Returns 1 if the scratch payload
 * is unavailable, otherwise the result of augmented__output().
 */
SEC("tp/syscalls/sys_enter_clock_nanosleep")
int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args = augmented_args_payload();
	const void *rqtp_arg = (const void *)args->args[2];
	unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
	__u32 size = sizeof(struct timespec64);

        if (augmented_args == NULL)
		goto failure;

	/* size is a compile-time constant here; the check mirrors the other augmenters */
	if (size > sizeof(augmented_args->arg.value))
                goto failure;

	/* NOTE(review): the read result is ignored and arg.size/arg.err are not written */
	bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);

	return augmented__output(args, augmented_args, len + size);
failure:
	return 1; /* Failure: don't filter */
}
400 
401 SEC("tp/syscalls/sys_enter_nanosleep")
402 int sys_enter_nanosleep(struct syscall_enter_args *args)
403 {
404 	struct augmented_args_payload *augmented_args = augmented_args_payload();
405 	const void *req_arg = (const void *)args->args[0];
406 	unsigned int len = sizeof(augmented_args->args);
407 	__u32 size = sizeof(struct timespec64);
408 
409         if (augmented_args == NULL)
410 		goto failure;
411 
412 	if (size > sizeof(augmented_args->arg.value))
413                 goto failure;
414 
415 	bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
416 
417 	return augmented__output(args, augmented_args, len + size);
418 failure:
419 	return 1; /* Failure: don't filter */
420 }
421 
422 static pid_t getpid(void)
423 {
424 	return bpf_get_current_pid_tgid();
425 }
426 
427 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
428 {
429 	return bpf_map_lookup_elem(pids, &pid) != NULL;
430 }
431 
/*
 * Generic augmenter driven by beauty_map_enter.
 *
 * @ctx:  context handed to bpf_perf_event_output()
 * @args: sys_enter payload (syscall number and the six raw arguments)
 *
 * For each argument that the map marks as a pointer to augment, copy the
 * pointed-to data from user space into the per-CPU beauty payload, packing
 * the records back to back, then output the payload.
 *
 * Return: 1 when nothing was output (no map entry, no scratch payload,
 * nothing augmented, or a size check failed), which makes sys_enter() fall
 * back to the per-syscall tail call; otherwise the result of
 * augmented__beauty_output().
 */
static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
{
	bool augmented, do_output = false;
	/* value_size: bytes available for data in one augmented_arg, i.e. without the size/err header */
	int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
	u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
	s64 aug_size, size;
	unsigned int nr, *beauty_map;
	struct beauty_payload_enter *payload;
	void *arg, *payload_offset;

	/* fall back to do predefined tail call */
	if (args == NULL)
		return 1;

	/* use syscall number to get beauty_map entry */
	nr             = (__u32)args->syscall_nr;
	beauty_map     = bpf_map_lookup_elem(&beauty_map_enter, &nr);

	/* set up payload for output */
	payload        = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
	/* only an address computation; payload is NULL-checked before any access */
	payload_offset = (void *)&payload->aug_args;

	if (beauty_map == NULL || payload == NULL)
		return 1;

	/* copy the sys_enter header, which has the syscall_nr */
	__builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));

	/*
	 * Determine what type of argument and how many bytes to read from user space, using the
	 * value in the beauty_map. This is the relation of parameter type and its corresponding
	 * value in the beauty map, and how many bytes we read eventually:
	 *
	 * string: 1			      -> size of string
	 * struct: size of struct	      -> size of struct
	 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
	 */
	for (int i = 0; i < 6; i++) {
		arg = (void *)args->args[i];
		augmented = false;
		/* beauty_map[i] is unsigned, so size is never negative as an s64 */
		size = beauty_map[i];
		aug_size = size; /* size of the augmented data read from user space */

		/* nothing to augment for this argument, or a NULL pointer was passed */
		if (size == 0 || arg == NULL)
			continue;

		if (size == 1) { /* string */
			aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
			/* minimum of 0 to pass the verifier */
			if (aug_size < 0)
				aug_size = 0;

			/* a failed string read is still emitted, as a record of size 0 */
			augmented = true;
		} else if (size > 0 && size <= value_size) { /* struct */
			if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
				augmented = true;
		/*
		 * NOTE(review): because size is zero-extended from an unsigned
		 * int, "size >= -6" is always true; only the (int) cast detects
		 * the negative encoding. Codes below -6 would therefore reach
		 * this branch and, after the "& 7", could index args[6..7].
		 */
		} else if ((int)size < 0 && size >= -6) { /* buffer */
			/* -1 -> argument 0, -2 -> argument 1, ... */
			index = -(size + 1);
			barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
			index &= 7;	    // Satisfy the bounds checking with the verifier in some kernels.
			/* length comes from the paired argument, capped at TRACE_AUG_MAX_BUF */
			aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];

			if (aug_size > 0) {
				if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
					augmented = true;
			}
		}

		/* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
		if (aug_size > value_size)
			aug_size = value_size;

		/* write data to payload */
		if (augmented) {
			/* record length: size/err header plus the data just read */
			int written = offsetof(struct augmented_arg, value) + aug_size;

			if (written < 0 || written > sizeof(struct augmented_arg))
				return 1;

			/* only size is filled in; err is not written by this function */
			((struct augmented_arg *)payload_offset)->size = aug_size;
			output += written;
			/* next record starts right after this one (records are packed) */
			payload_offset += written;
			do_output = true;
		}
	}

	if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
		return 1;

	return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
}
523 
/*
 * Entry point for every syscall entry.
 *
 * Returns 0 (filtered) for pids present in pids_filtered, and 1 when the
 * scratch payload is unavailable. Otherwise it copies the tracepoint payload
 * to the scratch area and tries the generic augment_sys_enter(); if that
 * reports it did not output anything, it tail-calls the program registered
 * for this syscall number in syscalls_sys_enter.
 */
SEC("tp/raw_syscalls/sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
	struct augmented_args_payload *augmented_args;
	/*
	 * What ends up in the perf ring buffer, if this is not filtered out
	 * by pid_filter__has(), starts with the non-augmented raw syscall
	 * payload, i.e. sizeof(augmented_args->args) bytes.
	 *
	 * The augmenters append their data right after that initial,
	 * non-augmented raw_syscalls:sys_enter payload.
	 */

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	augmented_args = augmented_args_payload();
	if (augmented_args == NULL)
		return 1;

	/* stash the raw payload where the tail-called augmenters will find it */
	bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);

	/*
	 * Jump to syscall specific augmenter, even if the default one,
	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
	 * unaugmented tracepoint payload.
	 *
	 * The tail call is only attempted when augment_sys_enter() returns
	 * non-zero, i.e. when the generic path did not output the event.
	 */
	if (augment_sys_enter(args, &augmented_args->args))
		bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);

	// If not found on the PROG_ARRAY syscalls map, then we're filtering it:
	return 0;
}
558 
/*
 * Entry point for every syscall exit.
 *
 * Returns 0 (filtered) for pids present in pids_filtered; otherwise
 * tail-calls the program registered for this syscall number in
 * syscalls_sys_exit, and returns 0 if the tail call does not happen.
 */
SEC("tp/raw_syscalls/sys_exit")
int sys_exit(struct syscall_exit_args *args)
{
	struct syscall_exit_args exit_args;

	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	/* local copy of the payload; only syscall_nr is used below */
	bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
	/*
	 * Jump to syscall specific return augmenter, even if the default one,
	 * "!raw_syscalls:unaugmented" that will just return 1 to return the
	 * unaugmented tracepoint payload.
	 */
	bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
	/*
	 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
	 */
	return 0;
}
579 
/* License string emitted into the object's "license" section. */
char _license[] SEC("license") = "GPL";
581