1 // SPDX-License-Identifier: GPL-2.0
/*
 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
 *
 * This exactly matches what is marshalled into the raw_syscalls:sys_enter
 * payload expected by the 'perf trace' beautifiers.
 */
8
9 #include "vmlinux.h"
10
11 #include <bpf/bpf_helpers.h>
12 #include <linux/limits.h>
13
/*
 * PERF_ALIGN(x, a): round x up to a multiple of a. The mask arithmetic is only
 * meaningful when a is a power of two; the mask is computed in the type of x.
 */
#define __PERF_ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
#define PERF_ALIGN(x, a)		__PERF_ALIGN_MASK(x, (__typeof__(x))(a)-1)
16
/**
 * is_power_of_2() - check if a value is a power of two
 * @n: the value to check (evaluated more than once, so it must be free of
 *     side effects)
 *
 * Determine whether some value is a power of two, where zero is *not*
 * considered a power of two. Return: true if @n is a power of 2, otherwise
 * false.
 *
 * Every use of @n is parenthesized so that expression arguments such as
 * `a & b` or `a | b` are evaluated as a unit.
 */
#define is_power_of_2(n) ((n) != 0 && (((n) & ((n) - 1)) == 0))
26
/* Number of slots in the __augmented_syscalls__ perf event array. */
#define MAX_CPUS 4096

/*
 * Cap, in bytes, on how much of a buffer argument augment_sys_enter() copies
 * from user space.
 */
#define TRACE_AUG_MAX_BUF 32 /* for buffer augmentation in perf trace */
30
/*
 * bpf-output associated map
 *
 * Perf event array that augmented__output() and augmented__beauty_output()
 * write to via bpf_perf_event_output().
 */
struct __augmented_syscalls__ {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__type(key, int);
	__type(value, __u32);
	__uint(max_entries, MAX_CPUS);
} __augmented_syscalls__ SEC(".maps");
38
/*
 * What to augment at entry?
 *
 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel
 *
 * Program array indexed by syscall number; sys_enter() tail calls into it.
 */
struct syscalls_sys_enter {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_enter SEC(".maps");
50
/*
 * What to augment at exit?
 *
 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace.
 *
 * Program array indexed by syscall number; sys_exit() tail calls into it.
 */
struct syscalls_sys_exit {
	__uint(type, BPF_MAP_TYPE_PROG_ARRAY);
	__type(key, __u32);
	__type(value, __u32);
	__uint(max_entries, 512);
} syscalls_sys_exit SEC(".maps");
62
/*
 * Context layout used by the sys_enter programs in this file: a header, the
 * syscall number and the six raw syscall arguments.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields; /* presumably the common tracepoint header - TODO confirm */
	long		   syscall_nr;
	unsigned long	   args[6];
};
68
/*
 * Context layout used by sys_exit(): a header, the syscall number and the
 * syscall return value.
 */
struct syscall_exit_args {
	unsigned long long common_tp_fields; /* presumably the common tracepoint header - TODO confirm */
	long		   syscall_nr;
	long		   ret;
};
74
/*
 * Desired design of maximum size and alignment (see RFC2553)
 */
#define SS_MAXSIZE 128 /* Implementation specific max size */

/* Type of the address family field at the start of struct sockaddr_storage. */
typedef unsigned short sa_family_t;
81
/*
 * FIXME: Should come from system headers
 *
 * The definition uses anonymous union and struct in order to control the
 * default alignment.
 */
struct sockaddr_storage {
	union {
		struct {
			sa_family_t ss_family; /* address family */
			/* Following field(s) are implementation specific */
			char __data[SS_MAXSIZE - sizeof(unsigned short)];
			/* space to achieve desired size, */
			/* SS_MAXSIZE value minus size of ss_family */
		};
		void *__align; /* implementation specific desired alignment */
	};
};
100
/*
 * One augmented argument record: a size + err header followed by the bytes
 * copied from user space, viewed either as raw bytes/string or as a sockaddr.
 */
struct augmented_arg {
	unsigned int	size;	/* number of payload bytes, as set by the handler */
	int		err;	/* negative error from the user-space read, or 0 */
	union {
		char   value[PATH_MAX];
		struct sockaddr_storage saddr;
	};
};
109
/*
 * Ids whose syscalls must not be traced: sys_enter() and sys_exit() return 0
 * right away when the current id is a key in this map. Only key presence is
 * checked (see pid_filter__has()); the bool value is not read in this file.
 */
struct pids_filtered {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, pid_t);
	__type(value, bool);
	__uint(max_entries, 64);
} pids_filtered SEC(".maps");
116
/*
 * What the per-syscall handlers send to the perf ring buffer: the raw syscall
 * args followed by the augmented argument record(s).
 */
struct augmented_args_payload {
	struct syscall_enter_args args;
	struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc)
};
121
// We need more tmp space than the BPF stack can give us
// Single-entry per-CPU array holding one struct augmented_args_payload,
// fetched through augmented_args_payload().
struct augmented_args_tmp {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct augmented_args_payload);
	__uint(max_entries, 1);
} augmented_args_tmp SEC(".maps");
129
/*
 * Per-syscall description of what to augment, looked up by syscall number in
 * augment_sys_enter(). Each of the six values describes one argument; see the
 * encoding documented inside augment_sys_enter().
 */
struct beauty_map_enter {
	__uint(type, BPF_MAP_TYPE_HASH);
	__type(key, int);
	__type(value, __u32[6]);
	__uint(max_entries, 512);
} beauty_map_enter SEC(".maps");
136
/*
 * Output record built by augment_sys_enter(): the raw syscall args followed by
 * room for up to six augmented argument records.
 */
struct beauty_payload_enter {
	struct syscall_enter_args args;
	struct augmented_arg aug_args[6];
};
141
/*
 * Single-entry per-CPU array providing the struct beauty_payload_enter scratch
 * buffer used by augment_sys_enter().
 */
struct beauty_payload_enter_map {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__type(key, int);
	__type(value, struct beauty_payload_enter);
	__uint(max_entries, 1);
} beauty_payload_enter_map SEC(".maps");
148
augmented_args_payload(void)149 static inline struct augmented_args_payload *augmented_args_payload(void)
150 {
151 int key = 0;
152 return bpf_map_lookup_elem(&augmented_args_tmp, &key);
153 }
154
augmented__output(void * ctx,struct augmented_args_payload * args,int len)155 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len)
156 {
157 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */
158 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len);
159 }
160
augmented__beauty_output(void * ctx,void * data,int len)161 static inline int augmented__beauty_output(void *ctx, void *data, int len)
162 {
163 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len);
164 }
165
166 static inline
augmented_arg__read_str(struct augmented_arg * augmented_arg,const void * arg,unsigned int arg_len)167 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len)
168 {
169 unsigned int augmented_len = sizeof(*augmented_arg);
170 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg);
171
172 augmented_arg->size = augmented_arg->err = 0;
173 /*
174 * probe_read_str may return < 0, e.g. -EFAULT
175 * So we leave that in the augmented_arg->size that userspace will
176 */
177 if (string_len > 0) {
178 augmented_len -= sizeof(augmented_arg->value) - string_len;
179 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two");
180 augmented_len &= sizeof(augmented_arg->value) - 1;
181 augmented_arg->size = string_len;
182 } else {
183 /*
184 * So that username notice the error while still being able
185 * to skip this augmented arg record
186 */
187 augmented_arg->err = string_len;
188 augmented_len = offsetof(struct augmented_arg, value);
189 }
190
191 return augmented_len;
192 }
193
/*
 * Default handler that performs no augmentation: it just returns 1, the value
 * the other handlers in this file use for "don't filter", so the tracepoint
 * payload is recorded unaugmented.
 */
SEC("tp/raw_syscalls/sys_enter")
int syscall_unaugmented(struct syscall_enter_args *args)
{
	return 1;
}
199
200 /*
201 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in
202 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go
203 * on from there, reading the first syscall arg as a string, i.e. open's
204 * filename.
205 */
206 SEC("tp/syscalls/sys_enter_connect")
sys_enter_connect(struct syscall_enter_args * args)207 int sys_enter_connect(struct syscall_enter_args *args)
208 {
209 struct augmented_args_payload *augmented_args = augmented_args_payload();
210 const void *sockaddr_arg = (const void *)args->args[1];
211 unsigned int socklen = args->args[2];
212 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
213
214 if (augmented_args == NULL)
215 return 1; /* Failure: don't filter */
216
217 _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two");
218 socklen &= sizeof(augmented_args->arg.saddr) - 1;
219
220 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
221 augmented_args->arg.size = socklen;
222 augmented_args->arg.err = 0;
223
224 return augmented__output(args, augmented_args, len + socklen);
225 }
226
227 SEC("tp/syscalls/sys_enter_sendto")
sys_enter_sendto(struct syscall_enter_args * args)228 int sys_enter_sendto(struct syscall_enter_args *args)
229 {
230 struct augmented_args_payload *augmented_args = augmented_args_payload();
231 const void *sockaddr_arg = (const void *)args->args[4];
232 unsigned int socklen = args->args[5];
233 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
234
235 if (augmented_args == NULL)
236 return 1; /* Failure: don't filter */
237
238 socklen &= sizeof(augmented_args->arg.saddr) - 1;
239
240 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg);
241
242 return augmented__output(args, augmented_args, len + socklen);
243 }
244
245 SEC("tp/syscalls/sys_enter_open")
sys_enter_open(struct syscall_enter_args * args)246 int sys_enter_open(struct syscall_enter_args *args)
247 {
248 struct augmented_args_payload *augmented_args = augmented_args_payload();
249 const void *filename_arg = (const void *)args->args[0];
250 unsigned int len = sizeof(augmented_args->args);
251
252 if (augmented_args == NULL)
253 return 1; /* Failure: don't filter */
254
255 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
256
257 return augmented__output(args, augmented_args, len);
258 }
259
260 SEC("tp/syscalls/sys_enter_openat")
sys_enter_openat(struct syscall_enter_args * args)261 int sys_enter_openat(struct syscall_enter_args *args)
262 {
263 struct augmented_args_payload *augmented_args = augmented_args_payload();
264 const void *filename_arg = (const void *)args->args[1];
265 unsigned int len = sizeof(augmented_args->args);
266
267 if (augmented_args == NULL)
268 return 1; /* Failure: don't filter */
269
270 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value));
271
272 return augmented__output(args, augmented_args, len);
273 }
274
275 SEC("tp/syscalls/sys_enter_rename")
sys_enter_rename(struct syscall_enter_args * args)276 int sys_enter_rename(struct syscall_enter_args *args)
277 {
278 struct augmented_args_payload *augmented_args = augmented_args_payload();
279 const void *oldpath_arg = (const void *)args->args[0],
280 *newpath_arg = (const void *)args->args[1];
281 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
282
283 if (augmented_args == NULL)
284 return 1; /* Failure: don't filter */
285
286 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
287
288 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
289 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
290 len += augmented_args->arg.size;
291
292 /* Every read from userspace is limited to value size */
293 if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
294 return 1; /* Failure: don't filter */
295
296 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
297
298 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
299 arg2->size = newpath_len;
300
301 len += newpath_len;
302
303 return augmented__output(args, augmented_args, len);
304 }
305
306 SEC("tp/syscalls/sys_enter_renameat2")
sys_enter_renameat2(struct syscall_enter_args * args)307 int sys_enter_renameat2(struct syscall_enter_args *args)
308 {
309 struct augmented_args_payload *augmented_args = augmented_args_payload();
310 const void *oldpath_arg = (const void *)args->args[1],
311 *newpath_arg = (const void *)args->args[3];
312 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len;
313
314 if (augmented_args == NULL)
315 return 1; /* Failure: don't filter */
316
317 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload...
318
319 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value));
320 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64));
321 len += augmented_args->arg.size;
322
323 /* Every read from userspace is limited to value size */
324 if (augmented_args->arg.size > sizeof(augmented_args->arg.value))
325 return 1; /* Failure: don't filter */
326
327 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size;
328
329 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value));
330 arg2->size = newpath_len;
331
332 len += newpath_len;
333
334 return augmented__output(args, augmented_args, len);
335 }
336
/* Size used by sys_enter_perf_event_open() when the attr's size field is 0. */
#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */

// we need just the start, get the size to then copy it
struct perf_event_attr_size {
	__u32 type;
	/*
	 * Size of the attr structure, for fwd/bwd compat.
	 */
	__u32 size;
};
347
348 SEC("tp/syscalls/sys_enter_perf_event_open")
sys_enter_perf_event_open(struct syscall_enter_args * args)349 int sys_enter_perf_event_open(struct syscall_enter_args *args)
350 {
351 struct augmented_args_payload *augmented_args = augmented_args_payload();
352 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read;
353 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
354
355 if (augmented_args == NULL)
356 goto failure;
357
358 if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0)
359 goto failure;
360
361 attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value;
362
363 __u32 size = attr_read->size;
364
365 if (!size)
366 size = PERF_ATTR_SIZE_VER0;
367
368 if (size > sizeof(augmented_args->arg.value))
369 goto failure;
370
371 // Now that we read attr->size and tested it against the size limits, read it completely
372 if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0)
373 goto failure;
374
375 return augmented__output(args, augmented_args, len + size);
376 failure:
377 return 1; /* Failure: don't filter */
378 }
379
380 SEC("tp/syscalls/sys_enter_clock_nanosleep")
sys_enter_clock_nanosleep(struct syscall_enter_args * args)381 int sys_enter_clock_nanosleep(struct syscall_enter_args *args)
382 {
383 struct augmented_args_payload *augmented_args = augmented_args_payload();
384 const void *rqtp_arg = (const void *)args->args[2];
385 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs
386 __u32 size = sizeof(struct timespec64);
387
388 if (augmented_args == NULL)
389 goto failure;
390
391 if (size > sizeof(augmented_args->arg.value))
392 goto failure;
393
394 bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg);
395
396 return augmented__output(args, augmented_args, len + size);
397 failure:
398 return 1; /* Failure: don't filter */
399 }
400
401 SEC("tp/syscalls/sys_enter_nanosleep")
sys_enter_nanosleep(struct syscall_enter_args * args)402 int sys_enter_nanosleep(struct syscall_enter_args *args)
403 {
404 struct augmented_args_payload *augmented_args = augmented_args_payload();
405 const void *req_arg = (const void *)args->args[0];
406 unsigned int len = sizeof(augmented_args->args);
407 __u32 size = sizeof(struct timespec64);
408
409 if (augmented_args == NULL)
410 goto failure;
411
412 if (size > sizeof(augmented_args->arg.value))
413 goto failure;
414
415 bpf_probe_read_user(&augmented_args->arg.value, size, req_arg);
416
417 return augmented__output(args, augmented_args, len + size);
418 failure:
419 return 1; /* Failure: don't filter */
420 }
421
getpid(void)422 static pid_t getpid(void)
423 {
424 return bpf_get_current_pid_tgid();
425 }
426
pid_filter__has(struct pids_filtered * pids,pid_t pid)427 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid)
428 {
429 return bpf_map_lookup_elem(pids, &pid) != NULL;
430 }
431
augment_sys_enter(void * ctx,struct syscall_enter_args * args)432 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args)
433 {
434 bool augmented, do_output = false;
435 int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value);
436 u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */
437 s64 aug_size, size;
438 unsigned int nr, *beauty_map;
439 struct beauty_payload_enter *payload;
440 void *arg, *payload_offset;
441
442 /* fall back to do predefined tail call */
443 if (args == NULL)
444 return 1;
445
446 /* use syscall number to get beauty_map entry */
447 nr = (__u32)args->syscall_nr;
448 beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr);
449
450 /* set up payload for output */
451 payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero);
452 payload_offset = (void *)&payload->aug_args;
453
454 if (beauty_map == NULL || payload == NULL)
455 return 1;
456
457 /* copy the sys_enter header, which has the syscall_nr */
458 __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args));
459
460 /*
461 * Determine what type of argument and how many bytes to read from user space, using the
462 * value in the beauty_map. This is the relation of parameter type and its corresponding
463 * value in the beauty map, and how many bytes we read eventually:
464 *
465 * string: 1 -> size of string
466 * struct: size of struct -> size of struct
467 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF)
468 */
469 for (int i = 0; i < 6; i++) {
470 arg = (void *)args->args[i];
471 augmented = false;
472 size = beauty_map[i];
473 aug_size = size; /* size of the augmented data read from user space */
474
475 if (size == 0 || arg == NULL)
476 continue;
477
478 if (size == 1) { /* string */
479 aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg);
480 /* minimum of 0 to pass the verifier */
481 if (aug_size < 0)
482 aug_size = 0;
483
484 augmented = true;
485 } else if (size > 0 && size <= value_size) { /* struct */
486 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg))
487 augmented = true;
488 } else if ((int)size < 0 && size >= -6) { /* buffer */
489 index = -(size + 1);
490 barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick.
491 index &= 7; // Satisfy the bounds checking with the verifier in some kernels.
492 aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index];
493
494 if (aug_size > 0) {
495 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg))
496 augmented = true;
497 }
498 }
499
500 /* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */
501 if (aug_size > value_size)
502 aug_size = value_size;
503
504 /* write data to payload */
505 if (augmented) {
506 int written = offsetof(struct augmented_arg, value) + aug_size;
507
508 if (written < 0 || written > sizeof(struct augmented_arg))
509 return 1;
510
511 ((struct augmented_arg *)payload_offset)->size = aug_size;
512 output += written;
513 payload_offset += written;
514 do_output = true;
515 }
516 }
517
518 if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter))
519 return 1;
520
521 return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output);
522 }
523
524 SEC("tp/raw_syscalls/sys_enter")
sys_enter(struct syscall_enter_args * args)525 int sys_enter(struct syscall_enter_args *args)
526 {
527 struct augmented_args_payload *augmented_args;
528 /*
529 * We start len, the amount of data that will be in the perf ring
530 * buffer, if this is not filtered out by one of pid_filter__has(),
531 * syscall->enabled, etc, with the non-augmented raw syscall payload,
532 * i.e. sizeof(augmented_args->args).
533 *
534 * We'll add to this as we add augmented syscalls right after that
535 * initial, non-augmented raw_syscalls:sys_enter payload.
536 */
537
538 if (pid_filter__has(&pids_filtered, getpid()))
539 return 0;
540
541 augmented_args = augmented_args_payload();
542 if (augmented_args == NULL)
543 return 1;
544
545 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args);
546
547 /*
548 * Jump to syscall specific augmenter, even if the default one,
549 * "!raw_syscalls:unaugmented" that will just return 1 to return the
550 * unaugmented tracepoint payload.
551 */
552 if (augment_sys_enter(args, &augmented_args->args))
553 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr);
554
555 // If not found on the PROG_ARRAY syscalls map, then we're filtering it:
556 return 0;
557 }
558
559 SEC("tp/raw_syscalls/sys_exit")
sys_exit(struct syscall_exit_args * args)560 int sys_exit(struct syscall_exit_args *args)
561 {
562 struct syscall_exit_args exit_args;
563
564 if (pid_filter__has(&pids_filtered, getpid()))
565 return 0;
566
567 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args);
568 /*
569 * Jump to syscall specific return augmenter, even if the default one,
570 * "!raw_syscalls:unaugmented" that will just return 1 to return the
571 * unaugmented tracepoint payload.
572 */
573 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr);
574 /*
575 * If not found on the PROG_ARRAY syscalls map, then we're filtering it:
576 */
577 return 0;
578 }
579
/* License string placed in the "license" section of the object. */
char _license[] SEC("license") = "GPL";
581