1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include "vmlinux.h" 10 #include <bpf/bpf_helpers.h> 11 #include <linux/limits.h> 12 13 /** 14 * is_power_of_2() - check if a value is a power of two 15 * @n: the value to check 16 * 17 * Determine whether some value is a power of two, where zero is *not* 18 * considered a power of two. Return: true if @n is a power of 2, otherwise 19 * false. 20 */ 21 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 22 23 #define MAX_CPUS 4096 24 25 /* bpf-output associated map */ 26 struct __augmented_syscalls__ { 27 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 28 __type(key, int); 29 __type(value, __u32); 30 __uint(max_entries, MAX_CPUS); 31 } __augmented_syscalls__ SEC(".maps"); 32 33 /* 34 * What to augment at entry? 35 * 36 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 37 */ 38 struct syscalls_sys_enter { 39 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 40 __type(key, __u32); 41 __type(value, __u32); 42 __uint(max_entries, 512); 43 } syscalls_sys_enter SEC(".maps"); 44 45 /* 46 * What to augment at exit? 47 * 48 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 49 */ 50 struct syscalls_sys_exit { 51 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 52 __type(key, __u32); 53 __type(value, __u32); 54 __uint(max_entries, 512); 55 } syscalls_sys_exit SEC(".maps"); 56 57 struct syscall_enter_args { 58 unsigned long long common_tp_fields; 59 long syscall_nr; 60 unsigned long args[6]; 61 }; 62 63 struct syscall_exit_args { 64 unsigned long long common_tp_fields; 65 long syscall_nr; 66 long ret; 67 }; 68 69 struct augmented_arg { 70 unsigned int size; 71 int err; 72 char value[PATH_MAX]; 73 }; 74 75 struct pids_filtered { 76 __uint(type, BPF_MAP_TYPE_HASH); 77 __type(key, pid_t); 78 __type(value, bool); 79 __uint(max_entries, 64); 80 } pids_filtered SEC(".maps"); 81 82 /* 83 * Desired design of maximum size and alignment (see RFC2553) 84 */ 85 #define SS_MAXSIZE 128 /* Implementation specific max size */ 86 87 typedef unsigned short sa_family_t; 88 89 /* 90 * FIXME: Should come from system headers 91 * 92 * The definition uses anonymous union and struct in order to control the 93 * default alignment. 94 */ 95 struct sockaddr_storage { 96 union { 97 struct { 98 sa_family_t ss_family; /* address family */ 99 /* Following field(s) are implementation specific */ 100 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 101 /* space to achieve desired size, */ 102 /* _SS_MAXSIZE value minus size of ss_family */ 103 }; 104 void *__align; /* implementation specific desired alignment */ 105 }; 106 }; 107 108 struct augmented_args_payload { 109 struct syscall_enter_args args; 110 union { 111 struct { 112 struct augmented_arg arg, arg2; 113 }; 114 struct sockaddr_storage saddr; 115 char __data[sizeof(struct augmented_arg)]; 116 }; 117 }; 118 119 // We need more tmp space than the BPF stack can give us 120 struct augmented_args_tmp { 121 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 122 __type(key, int); 123 __type(value, struct augmented_args_payload); 124 __uint(max_entries, 1); 125 } augmented_args_tmp SEC(".maps"); 126 127 static inline struct augmented_args_payload *augmented_args_payload(void) 128 { 129 int key = 0; 130 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 131 } 132 133 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 134 { 135 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 136 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 137 } 138 139 static inline 140 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 141 { 142 unsigned int augmented_len = sizeof(*augmented_arg); 143 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 144 145 augmented_arg->size = augmented_arg->err = 0; 146 /* 147 * probe_read_str may return < 0, e.g. -EFAULT 148 * So we leave that in the augmented_arg->size that userspace will 149 */ 150 if (string_len > 0) { 151 augmented_len -= sizeof(augmented_arg->value) - string_len; 152 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 153 augmented_len &= sizeof(augmented_arg->value) - 1; 154 augmented_arg->size = string_len; 155 } else { 156 /* 157 * So that username notice the error while still being able 158 * to skip this augmented arg record 159 */ 160 augmented_arg->err = string_len; 161 augmented_len = offsetof(struct augmented_arg, value); 162 } 163 164 return augmented_len; 165 } 166 167 SEC("tp/raw_syscalls/sys_enter") 168 int syscall_unaugmented(struct syscall_enter_args *args) 169 { 170 return 1; 171 } 172 173 /* 174 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 175 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 176 * on from there, reading the first syscall arg as a string, i.e. open's 177 * filename. 178 */ 179 SEC("tp/syscalls/sys_enter_connect") 180 int sys_enter_connect(struct syscall_enter_args *args) 181 { 182 struct augmented_args_payload *augmented_args = augmented_args_payload(); 183 const void *sockaddr_arg = (const void *)args->args[1]; 184 unsigned int socklen = args->args[2]; 185 unsigned int len = sizeof(augmented_args->args); 186 187 if (augmented_args == NULL) 188 return 1; /* Failure: don't filter */ 189 190 _Static_assert(is_power_of_2(sizeof(augmented_args->saddr)), "sizeof(augmented_args->saddr) needs to be a power of two"); 191 socklen &= sizeof(augmented_args->saddr) - 1; 192 193 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 194 195 return augmented__output(args, augmented_args, len + socklen); 196 } 197 198 SEC("tp/syscalls/sys_enter_sendto") 199 int sys_enter_sendto(struct syscall_enter_args *args) 200 { 201 struct augmented_args_payload *augmented_args = augmented_args_payload(); 202 const void *sockaddr_arg = (const void *)args->args[4]; 203 unsigned int socklen = args->args[5]; 204 unsigned int len = sizeof(augmented_args->args); 205 206 if (augmented_args == NULL) 207 return 1; /* Failure: don't filter */ 208 209 socklen &= sizeof(augmented_args->saddr) - 1; 210 211 bpf_probe_read_user(&augmented_args->saddr, socklen, sockaddr_arg); 212 213 return augmented__output(args, augmented_args, len + socklen); 214 } 215 216 SEC("tp/syscalls/sys_enter_open") 217 int sys_enter_open(struct syscall_enter_args *args) 218 { 219 struct augmented_args_payload *augmented_args = augmented_args_payload(); 220 const void *filename_arg = (const void *)args->args[0]; 221 unsigned int len = sizeof(augmented_args->args); 222 223 if (augmented_args == NULL) 224 return 1; /* Failure: don't filter */ 225 226 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 227 228 return augmented__output(args, augmented_args, len); 229 } 230 231 SEC("tp/syscalls/sys_enter_openat") 232 int sys_enter_openat(struct syscall_enter_args *args) 233 { 234 struct augmented_args_payload *augmented_args = augmented_args_payload(); 235 const void *filename_arg = (const void *)args->args[1]; 236 unsigned int len = sizeof(augmented_args->args); 237 238 if (augmented_args == NULL) 239 return 1; /* Failure: don't filter */ 240 241 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 242 243 return augmented__output(args, augmented_args, len); 244 } 245 246 SEC("tp/syscalls/sys_enter_rename") 247 int sys_enter_rename(struct syscall_enter_args *args) 248 { 249 struct augmented_args_payload *augmented_args = augmented_args_payload(); 250 const void *oldpath_arg = (const void *)args->args[0], 251 *newpath_arg = (const void *)args->args[1]; 252 unsigned int len = sizeof(augmented_args->args), oldpath_len; 253 254 if (augmented_args == NULL) 255 return 1; /* Failure: don't filter */ 256 257 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 258 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 259 260 return augmented__output(args, augmented_args, len); 261 } 262 263 SEC("tp/syscalls/sys_enter_renameat") 264 int sys_enter_renameat(struct syscall_enter_args *args) 265 { 266 struct augmented_args_payload *augmented_args = augmented_args_payload(); 267 const void *oldpath_arg = (const void *)args->args[1], 268 *newpath_arg = (const void *)args->args[3]; 269 unsigned int len = sizeof(augmented_args->args), oldpath_len; 270 271 if (augmented_args == NULL) 272 return 1; /* Failure: don't filter */ 273 274 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 275 len += oldpath_len + augmented_arg__read_str((void *)(&augmented_args->arg) + oldpath_len, newpath_arg, sizeof(augmented_args->arg.value)); 276 277 return augmented__output(args, augmented_args, len); 278 } 279 280 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 281 282 // we need just the start, get the size to then copy it 283 struct perf_event_attr_size { 284 __u32 type; 285 /* 286 * Size of the attr structure, for fwd/bwd compat. 287 */ 288 __u32 size; 289 }; 290 291 SEC("tp/syscalls/sys_enter_perf_event_open") 292 int sys_enter_perf_event_open(struct syscall_enter_args *args) 293 { 294 struct augmented_args_payload *augmented_args = augmented_args_payload(); 295 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 296 unsigned int len = sizeof(augmented_args->args); 297 298 if (augmented_args == NULL) 299 goto failure; 300 301 if (bpf_probe_read_user(&augmented_args->__data, sizeof(*attr), attr) < 0) 302 goto failure; 303 304 attr_read = (const struct perf_event_attr_size *)augmented_args->__data; 305 306 __u32 size = attr_read->size; 307 308 if (!size) 309 size = PERF_ATTR_SIZE_VER0; 310 311 if (size > sizeof(augmented_args->__data)) 312 goto failure; 313 314 // Now that we read attr->size and tested it against the size limits, read it completely 315 if (bpf_probe_read_user(&augmented_args->__data, size, attr) < 0) 316 goto failure; 317 318 return augmented__output(args, augmented_args, len + size); 319 failure: 320 return 1; /* Failure: don't filter */ 321 } 322 323 SEC("tp/syscalls/sys_enter_clock_nanosleep") 324 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 325 { 326 struct augmented_args_payload *augmented_args = augmented_args_payload(); 327 const void *rqtp_arg = (const void *)args->args[2]; 328 unsigned int len = sizeof(augmented_args->args); 329 __u32 size = sizeof(struct timespec64); 330 331 if (augmented_args == NULL) 332 goto failure; 333 334 if (size > sizeof(augmented_args->__data)) 335 goto failure; 336 337 bpf_probe_read_user(&augmented_args->__data, size, rqtp_arg); 338 339 return augmented__output(args, augmented_args, len + size); 340 failure: 341 return 1; /* Failure: don't filter */ 342 } 343 344 static pid_t getpid(void) 345 { 346 return bpf_get_current_pid_tgid(); 347 } 348 349 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 350 { 351 return bpf_map_lookup_elem(pids, &pid) != NULL; 352 } 353 354 SEC("tp/raw_syscalls/sys_enter") 355 int sys_enter(struct syscall_enter_args *args) 356 { 357 struct augmented_args_payload *augmented_args; 358 /* 359 * We start len, the amount of data that will be in the perf ring 360 * buffer, if this is not filtered out by one of pid_filter__has(), 361 * syscall->enabled, etc, with the non-augmented raw syscall payload, 362 * i.e. sizeof(augmented_args->args). 363 * 364 * We'll add to this as we add augmented syscalls right after that 365 * initial, non-augmented raw_syscalls:sys_enter payload. 366 */ 367 368 if (pid_filter__has(&pids_filtered, getpid())) 369 return 0; 370 371 augmented_args = augmented_args_payload(); 372 if (augmented_args == NULL) 373 return 1; 374 375 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 376 377 /* 378 * Jump to syscall specific augmenter, even if the default one, 379 * "!raw_syscalls:unaugmented" that will just return 1 to return the 380 * unaugmented tracepoint payload. 381 */ 382 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 383 384 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 385 return 0; 386 } 387 388 SEC("tp/raw_syscalls/sys_exit") 389 int sys_exit(struct syscall_exit_args *args) 390 { 391 struct syscall_exit_args exit_args; 392 393 if (pid_filter__has(&pids_filtered, getpid())) 394 return 0; 395 396 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 397 /* 398 * Jump to syscall specific return augmenter, even if the default one, 399 * "!raw_syscalls:unaugmented" that will just return 1 to return the 400 * unaugmented tracepoint payload. 401 */ 402 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 403 /* 404 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 405 */ 406 return 0; 407 } 408 409 char _license[] SEC("license") = "GPL"; 410