1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include "vmlinux.h" 10 11 #include <bpf/bpf_helpers.h> 12 #include <linux/limits.h> 13 14 #define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) 15 #define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) 16 17 /** 18 * is_power_of_2() - check if a value is a power of two 19 * @n: the value to check 20 * 21 * Determine whether some value is a power of two, where zero is *not* 22 * considered a power of two. Return: true if @n is a power of 2, otherwise 23 * false. 24 */ 25 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 26 27 #define MAX_CPUS 4096 28 29 #define TRACE_AUG_MAX_BUF 32 /* for buffer augmentation in perf trace */ 30 31 /* bpf-output associated map */ 32 struct __augmented_syscalls__ { 33 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 34 __type(key, int); 35 __type(value, __u32); 36 __uint(max_entries, MAX_CPUS); 37 } __augmented_syscalls__ SEC(".maps"); 38 39 /* 40 * What to augment at entry? 41 * 42 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 43 */ 44 struct syscalls_sys_enter { 45 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 46 __type(key, __u32); 47 __type(value, __u32); 48 __uint(max_entries, 512); 49 } syscalls_sys_enter SEC(".maps"); 50 51 /* 52 * What to augment at exit? 53 * 54 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 55 */ 56 struct syscalls_sys_exit { 57 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 58 __type(key, __u32); 59 __type(value, __u32); 60 __uint(max_entries, 512); 61 } syscalls_sys_exit SEC(".maps"); 62 63 struct syscall_enter_args { 64 unsigned long long common_tp_fields; 65 long syscall_nr; 66 unsigned long args[6]; 67 }; 68 69 struct syscall_exit_args { 70 unsigned long long common_tp_fields; 71 long syscall_nr; 72 long ret; 73 }; 74 75 /* 76 * Desired design of maximum size and alignment (see RFC2553) 77 */ 78 #define SS_MAXSIZE 128 /* Implementation specific max size */ 79 80 typedef unsigned short sa_family_t; 81 82 /* 83 * FIXME: Should come from system headers 84 * 85 * The definition uses anonymous union and struct in order to control the 86 * default alignment. 87 */ 88 struct sockaddr_storage { 89 union { 90 struct { 91 sa_family_t ss_family; /* address family */ 92 /* Following field(s) are implementation specific */ 93 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 94 /* space to achieve desired size, */ 95 /* _SS_MAXSIZE value minus size of ss_family */ 96 }; 97 void *__align; /* implementation specific desired alignment */ 98 }; 99 }; 100 101 struct augmented_arg { 102 unsigned int size; 103 int err; 104 union { 105 char value[PATH_MAX]; 106 struct sockaddr_storage saddr; 107 }; 108 }; 109 110 struct pids_filtered { 111 __uint(type, BPF_MAP_TYPE_HASH); 112 __type(key, pid_t); 113 __type(value, bool); 114 __uint(max_entries, 64); 115 } pids_filtered SEC(".maps"); 116 117 struct augmented_args_payload { 118 struct syscall_enter_args args; 119 struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) 120 }; 121 122 // We need more tmp space than the BPF stack can give us 123 struct augmented_args_tmp { 124 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 125 __type(key, int); 126 __type(value, struct augmented_args_payload); 127 __uint(max_entries, 1); 128 } augmented_args_tmp SEC(".maps"); 129 130 struct beauty_map_enter { 131 __uint(type, BPF_MAP_TYPE_HASH); 132 __type(key, int); 133 __type(value, __u32[6]); 134 __uint(max_entries, 512); 135 } beauty_map_enter SEC(".maps"); 136 137 struct beauty_payload_enter { 138 struct syscall_enter_args args; 139 struct augmented_arg aug_args[6]; 140 }; 141 142 struct beauty_payload_enter_map { 143 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 144 __type(key, int); 145 __type(value, struct beauty_payload_enter); 146 __uint(max_entries, 1); 147 } beauty_payload_enter_map SEC(".maps"); 148 149 static inline struct augmented_args_payload *augmented_args_payload(void) 150 { 151 int key = 0; 152 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 153 } 154 155 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 156 { 157 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 158 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 159 } 160 161 static inline int augmented__beauty_output(void *ctx, void *data, int len) 162 { 163 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len); 164 } 165 166 static inline 167 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 168 { 169 unsigned int augmented_len = sizeof(*augmented_arg); 170 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 171 172 augmented_arg->size = augmented_arg->err = 0; 173 /* 174 * probe_read_str may return < 0, e.g. -EFAULT 175 * So we leave that in the augmented_arg->size that userspace will 176 */ 177 if (string_len > 0) { 178 augmented_len -= sizeof(augmented_arg->value) - string_len; 179 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 180 augmented_len &= sizeof(augmented_arg->value) - 1; 181 augmented_arg->size = string_len; 182 } else { 183 /* 184 * So that username notice the error while still being able 185 * to skip this augmented arg record 186 */ 187 augmented_arg->err = string_len; 188 augmented_len = offsetof(struct augmented_arg, value); 189 } 190 191 return augmented_len; 192 } 193 194 SEC("tp/raw_syscalls/sys_enter") 195 int syscall_unaugmented(struct syscall_enter_args *args) 196 { 197 return 1; 198 } 199 200 /* 201 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 202 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 203 * on from there, reading the first syscall arg as a string, i.e. open's 204 * filename. 205 */ 206 SEC("tp/syscalls/sys_enter_connect") 207 int sys_enter_connect(struct syscall_enter_args *args) 208 { 209 struct augmented_args_payload *augmented_args = augmented_args_payload(); 210 const void *sockaddr_arg = (const void *)args->args[1]; 211 unsigned int socklen = args->args[2]; 212 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 213 214 if (augmented_args == NULL) 215 return 1; /* Failure: don't filter */ 216 217 _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); 218 socklen &= sizeof(augmented_args->arg.saddr) - 1; 219 220 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 221 augmented_args->arg.size = socklen; 222 augmented_args->arg.err = 0; 223 224 return augmented__output(args, augmented_args, len + socklen); 225 } 226 227 SEC("tp/syscalls/sys_enter_sendto") 228 int sys_enter_sendto(struct syscall_enter_args *args) 229 { 230 struct augmented_args_payload *augmented_args = augmented_args_payload(); 231 const void *sockaddr_arg = (const void *)args->args[4]; 232 unsigned int socklen = args->args[5]; 233 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 234 235 if (augmented_args == NULL) 236 return 1; /* Failure: don't filter */ 237 238 socklen &= sizeof(augmented_args->arg.saddr) - 1; 239 240 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 241 242 return augmented__output(args, augmented_args, len + socklen); 243 } 244 245 SEC("tp/syscalls/sys_enter_open") 246 int sys_enter_open(struct syscall_enter_args *args) 247 { 248 struct augmented_args_payload *augmented_args = augmented_args_payload(); 249 const void *filename_arg = (const void *)args->args[0]; 250 unsigned int len = sizeof(augmented_args->args); 251 252 if (augmented_args == NULL) 253 return 1; /* Failure: don't filter */ 254 255 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 256 257 return augmented__output(args, augmented_args, len); 258 } 259 260 SEC("tp/syscalls/sys_enter_openat") 261 int sys_enter_openat(struct syscall_enter_args *args) 262 { 263 struct augmented_args_payload *augmented_args = augmented_args_payload(); 264 const void *filename_arg = (const void *)args->args[1]; 265 unsigned int len = sizeof(augmented_args->args); 266 267 if (augmented_args == NULL) 268 return 1; /* Failure: don't filter */ 269 270 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 271 272 return augmented__output(args, augmented_args, len); 273 } 274 275 SEC("tp/syscalls/sys_enter_rename") 276 int sys_enter_rename(struct syscall_enter_args *args) 277 { 278 struct augmented_args_payload *augmented_args = augmented_args_payload(); 279 const void *oldpath_arg = (const void *)args->args[0], 280 *newpath_arg = (const void *)args->args[1]; 281 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 282 283 if (augmented_args == NULL) 284 return 1; /* Failure: don't filter */ 285 286 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 287 288 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 289 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 290 len += augmented_args->arg.size; 291 292 /* Every read from userspace is limited to value size */ 293 if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) 294 return 1; /* Failure: don't filter */ 295 296 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 297 298 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 299 arg2->size = newpath_len; 300 301 len += newpath_len; 302 303 return augmented__output(args, augmented_args, len); 304 } 305 306 SEC("tp/syscalls/sys_enter_renameat2") 307 int sys_enter_renameat2(struct syscall_enter_args *args) 308 { 309 struct augmented_args_payload *augmented_args = augmented_args_payload(); 310 const void *oldpath_arg = (const void *)args->args[1], 311 *newpath_arg = (const void *)args->args[3]; 312 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 313 314 if (augmented_args == NULL) 315 return 1; /* Failure: don't filter */ 316 317 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 318 319 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 320 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 321 len += augmented_args->arg.size; 322 323 /* Every read from userspace is limited to value size */ 324 if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) 325 return 1; /* Failure: don't filter */ 326 327 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 328 329 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 330 arg2->size = newpath_len; 331 332 len += newpath_len; 333 334 return augmented__output(args, augmented_args, len); 335 } 336 337 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 338 339 // we need just the start, get the size to then copy it 340 struct perf_event_attr_size { 341 __u32 type; 342 /* 343 * Size of the attr structure, for fwd/bwd compat. 344 */ 345 __u32 size; 346 }; 347 348 SEC("tp/syscalls/sys_enter_perf_event_open") 349 int sys_enter_perf_event_open(struct syscall_enter_args *args) 350 { 351 struct augmented_args_payload *augmented_args = augmented_args_payload(); 352 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 353 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 354 355 if (augmented_args == NULL) 356 goto failure; 357 358 if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) 359 goto failure; 360 361 attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; 362 363 __u32 size = attr_read->size; 364 365 if (!size) 366 size = PERF_ATTR_SIZE_VER0; 367 368 if (size > sizeof(augmented_args->arg.value)) 369 goto failure; 370 371 // Now that we read attr->size and tested it against the size limits, read it completely 372 if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) 373 goto failure; 374 375 return augmented__output(args, augmented_args, len + size); 376 failure: 377 return 1; /* Failure: don't filter */ 378 } 379 380 SEC("tp/syscalls/sys_enter_clock_nanosleep") 381 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 382 { 383 struct augmented_args_payload *augmented_args = augmented_args_payload(); 384 const void *rqtp_arg = (const void *)args->args[2]; 385 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 386 __u32 size = sizeof(struct timespec64); 387 388 if (augmented_args == NULL) 389 goto failure; 390 391 if (size > sizeof(augmented_args->arg.value)) 392 goto failure; 393 394 bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); 395 396 return augmented__output(args, augmented_args, len + size); 397 failure: 398 return 1; /* Failure: don't filter */ 399 } 400 401 SEC("tp/syscalls/sys_enter_nanosleep") 402 int sys_enter_nanosleep(struct syscall_enter_args *args) 403 { 404 struct augmented_args_payload *augmented_args = augmented_args_payload(); 405 const void *req_arg = (const void *)args->args[0]; 406 unsigned int len = sizeof(augmented_args->args); 407 __u32 size = sizeof(struct timespec64); 408 409 if (augmented_args == NULL) 410 goto failure; 411 412 if (size > sizeof(augmented_args->arg.value)) 413 goto failure; 414 415 bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); 416 417 return augmented__output(args, augmented_args, len + size); 418 failure: 419 return 1; /* Failure: don't filter */ 420 } 421 422 static pid_t getpid(void) 423 { 424 return bpf_get_current_pid_tgid(); 425 } 426 427 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 428 { 429 return bpf_map_lookup_elem(pids, &pid) != NULL; 430 } 431 432 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) 433 { 434 bool augmented, do_output = false; 435 int zero = 0, index, value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); 436 u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */ 437 s64 aug_size, size; 438 unsigned int nr, *beauty_map; 439 struct beauty_payload_enter *payload; 440 void *arg, *payload_offset; 441 442 /* fall back to do predefined tail call */ 443 if (args == NULL) 444 return 1; 445 446 /* use syscall number to get beauty_map entry */ 447 nr = (__u32)args->syscall_nr; 448 beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); 449 450 /* set up payload for output */ 451 payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero); 452 payload_offset = (void *)&payload->aug_args; 453 454 if (beauty_map == NULL || payload == NULL) 455 return 1; 456 457 /* copy the sys_enter header, which has the syscall_nr */ 458 __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); 459 460 /* 461 * Determine what type of argument and how many bytes to read from user space, using the 462 * value in the beauty_map. This is the relation of parameter type and its corresponding 463 * value in the beauty map, and how many bytes we read eventually: 464 * 465 * string: 1 -> size of string 466 * struct: size of struct -> size of struct 467 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF) 468 */ 469 for (int i = 0; i < 6; i++) { 470 arg = (void *)args->args[i]; 471 augmented = false; 472 size = beauty_map[i]; 473 aug_size = size; /* size of the augmented data read from user space */ 474 475 if (size == 0 || arg == NULL) 476 continue; 477 478 if (size == 1) { /* string */ 479 aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg); 480 /* minimum of 0 to pass the verifier */ 481 if (aug_size < 0) 482 aug_size = 0; 483 484 augmented = true; 485 } else if (size > 0 && size <= value_size) { /* struct */ 486 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg)) 487 augmented = true; 488 } else if ((int)size < 0 && size >= -6) { /* buffer */ 489 index = -(size + 1); 490 barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. 491 index &= 7; // Satisfy the bounds checking with the verifier in some kernels. 492 aug_size = args->args[index] > TRACE_AUG_MAX_BUF ? TRACE_AUG_MAX_BUF : args->args[index]; 493 494 if (aug_size > 0) { 495 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) 496 augmented = true; 497 } 498 } 499 500 /* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */ 501 if (aug_size > value_size) 502 aug_size = value_size; 503 504 /* write data to payload */ 505 if (augmented) { 506 int written = offsetof(struct augmented_arg, value) + aug_size; 507 508 if (written < 0 || written > sizeof(struct augmented_arg)) 509 return 1; 510 511 ((struct augmented_arg *)payload_offset)->size = aug_size; 512 output += written; 513 payload_offset += written; 514 do_output = true; 515 } 516 } 517 518 if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter)) 519 return 1; 520 521 return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); 522 } 523 524 SEC("tp/raw_syscalls/sys_enter") 525 int sys_enter(struct syscall_enter_args *args) 526 { 527 struct augmented_args_payload *augmented_args; 528 /* 529 * We start len, the amount of data that will be in the perf ring 530 * buffer, if this is not filtered out by one of pid_filter__has(), 531 * syscall->enabled, etc, with the non-augmented raw syscall payload, 532 * i.e. sizeof(augmented_args->args). 533 * 534 * We'll add to this as we add augmented syscalls right after that 535 * initial, non-augmented raw_syscalls:sys_enter payload. 536 */ 537 538 if (pid_filter__has(&pids_filtered, getpid())) 539 return 0; 540 541 augmented_args = augmented_args_payload(); 542 if (augmented_args == NULL) 543 return 1; 544 545 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 546 547 /* 548 * Jump to syscall specific augmenter, even if the default one, 549 * "!raw_syscalls:unaugmented" that will just return 1 to return the 550 * unaugmented tracepoint payload. 551 */ 552 if (augment_sys_enter(args, &augmented_args->args)) 553 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 554 555 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 556 return 0; 557 } 558 559 SEC("tp/raw_syscalls/sys_exit") 560 int sys_exit(struct syscall_exit_args *args) 561 { 562 struct syscall_exit_args exit_args; 563 564 if (pid_filter__has(&pids_filtered, getpid())) 565 return 0; 566 567 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 568 /* 569 * Jump to syscall specific return augmenter, even if the default one, 570 * "!raw_syscalls:unaugmented" that will just return 1 to return the 571 * unaugmented tracepoint payload. 572 */ 573 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 574 /* 575 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 576 */ 577 return 0; 578 } 579 580 char _license[] SEC("license") = "GPL"; 581