1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include "vmlinux.h" 10 #include "../trace_augment.h" 11 12 #include <bpf/bpf_helpers.h> 13 #include <linux/limits.h> 14 15 #define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) 16 #define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) 17 18 /** 19 * is_power_of_2() - check if a value is a power of two 20 * @n: the value to check 21 * 22 * Determine whether some value is a power of two, where zero is *not* 23 * considered a power of two. Return: true if @n is a power of 2, otherwise 24 * false. 25 */ 26 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 27 28 #define MAX_CPUS 4096 29 30 /* bpf-output associated map */ 31 struct __augmented_syscalls__ { 32 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 33 __type(key, int); 34 __type(value, __u32); 35 __uint(max_entries, MAX_CPUS); 36 } __augmented_syscalls__ SEC(".maps"); 37 38 /* 39 * What to augment at entry? 40 * 41 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 42 */ 43 struct syscalls_sys_enter { 44 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 45 __type(key, __u32); 46 __type(value, __u32); 47 __uint(max_entries, 512); 48 } syscalls_sys_enter SEC(".maps"); 49 50 /* 51 * What to augment at exit? 52 * 53 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 54 */ 55 struct syscalls_sys_exit { 56 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 57 __type(key, __u32); 58 __type(value, __u32); 59 __uint(max_entries, 512); 60 } syscalls_sys_exit SEC(".maps"); 61 62 struct syscall_enter_args { 63 unsigned long long common_tp_fields; 64 long syscall_nr; 65 unsigned long args[6]; 66 }; 67 68 struct syscall_exit_args { 69 unsigned long long common_tp_fields; 70 long syscall_nr; 71 long ret; 72 }; 73 74 /* 75 * Desired design of maximum size and alignment (see RFC2553) 76 */ 77 #define SS_MAXSIZE 128 /* Implementation specific max size */ 78 79 typedef unsigned short sa_family_t; 80 81 /* 82 * FIXME: Should come from system headers 83 * 84 * The definition uses anonymous union and struct in order to control the 85 * default alignment. 86 */ 87 struct sockaddr_storage { 88 union { 89 struct { 90 sa_family_t ss_family; /* address family */ 91 /* Following field(s) are implementation specific */ 92 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 93 /* space to achieve desired size, */ 94 /* _SS_MAXSIZE value minus size of ss_family */ 95 }; 96 void *__align; /* implementation specific desired alignment */ 97 }; 98 }; 99 100 struct augmented_arg { 101 unsigned int size; 102 int err; 103 union { 104 char value[PATH_MAX]; 105 struct sockaddr_storage saddr; 106 }; 107 }; 108 109 struct pids_filtered { 110 __uint(type, BPF_MAP_TYPE_HASH); 111 __type(key, pid_t); 112 __type(value, bool); 113 __uint(max_entries, 64); 114 } pids_filtered SEC(".maps"); 115 116 struct augmented_args_payload { 117 struct syscall_enter_args args; 118 struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) 119 }; 120 121 // We need more tmp space than the BPF stack can give us 122 struct augmented_args_tmp { 123 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 124 __type(key, int); 125 __type(value, struct augmented_args_payload); 126 __uint(max_entries, 1); 127 } augmented_args_tmp SEC(".maps"); 128 129 struct beauty_map_enter { 130 __uint(type, BPF_MAP_TYPE_HASH); 131 __type(key, int); 132 __type(value, __u32[6]); 133 __uint(max_entries, 512); 134 } beauty_map_enter SEC(".maps"); 135 136 struct beauty_payload_enter { 137 struct syscall_enter_args args; 138 struct augmented_arg aug_args[6]; 139 }; 140 141 struct beauty_payload_enter_map { 142 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 143 __type(key, int); 144 __type(value, struct beauty_payload_enter); 145 __uint(max_entries, 1); 146 } beauty_payload_enter_map SEC(".maps"); 147 148 static inline struct augmented_args_payload *augmented_args_payload(void) 149 { 150 int key = 0; 151 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 152 } 153 154 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 155 { 156 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 157 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 158 } 159 160 static inline int augmented__beauty_output(void *ctx, void *data, int len) 161 { 162 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len); 163 } 164 165 static inline 166 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 167 { 168 unsigned int augmented_len = sizeof(*augmented_arg); 169 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 170 171 augmented_arg->size = augmented_arg->err = 0; 172 /* 173 * probe_read_str may return < 0, e.g. -EFAULT 174 * So we leave that in the augmented_arg->size that userspace will 175 */ 176 if (string_len > 0) { 177 augmented_len -= sizeof(augmented_arg->value) - string_len; 178 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 179 augmented_len &= sizeof(augmented_arg->value) - 1; 180 augmented_arg->size = string_len; 181 } else { 182 /* 183 * So that username notice the error while still being able 184 * to skip this augmented arg record 185 */ 186 augmented_arg->err = string_len; 187 augmented_len = offsetof(struct augmented_arg, value); 188 } 189 190 return augmented_len; 191 } 192 193 SEC("tp/raw_syscalls/sys_enter") 194 int syscall_unaugmented(struct syscall_enter_args *args) 195 { 196 return 1; 197 } 198 199 /* 200 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 201 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 202 * on from there, reading the first syscall arg as a string, i.e. open's 203 * filename. 204 */ 205 SEC("tp/syscalls/sys_enter_connect") 206 int sys_enter_connect(struct syscall_enter_args *args) 207 { 208 struct augmented_args_payload *augmented_args = augmented_args_payload(); 209 const void *sockaddr_arg = (const void *)args->args[1]; 210 unsigned int socklen = args->args[2]; 211 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 212 213 if (augmented_args == NULL) 214 return 1; /* Failure: don't filter */ 215 216 _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); 217 socklen &= sizeof(augmented_args->arg.saddr) - 1; 218 219 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 220 augmented_args->arg.size = socklen; 221 augmented_args->arg.err = 0; 222 223 return augmented__output(args, augmented_args, len + socklen); 224 } 225 226 SEC("tp/syscalls/sys_enter_sendto") 227 int sys_enter_sendto(struct syscall_enter_args *args) 228 { 229 struct augmented_args_payload *augmented_args = augmented_args_payload(); 230 const void *sockaddr_arg = (const void *)args->args[4]; 231 unsigned int socklen = args->args[5]; 232 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 233 234 if (augmented_args == NULL) 235 return 1; /* Failure: don't filter */ 236 237 socklen &= sizeof(augmented_args->arg.saddr) - 1; 238 239 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 240 241 return augmented__output(args, augmented_args, len + socklen); 242 } 243 244 SEC("tp/syscalls/sys_enter_open") 245 int sys_enter_open(struct syscall_enter_args *args) 246 { 247 struct augmented_args_payload *augmented_args = augmented_args_payload(); 248 const void *filename_arg = (const void *)args->args[0]; 249 unsigned int len = sizeof(augmented_args->args); 250 251 if (augmented_args == NULL) 252 return 1; /* Failure: don't filter */ 253 254 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 255 256 return augmented__output(args, augmented_args, len); 257 } 258 259 SEC("tp/syscalls/sys_enter_openat") 260 int sys_enter_openat(struct syscall_enter_args *args) 261 { 262 struct augmented_args_payload *augmented_args = augmented_args_payload(); 263 const void *filename_arg = (const void *)args->args[1]; 264 unsigned int len = sizeof(augmented_args->args); 265 266 if (augmented_args == NULL) 267 return 1; /* Failure: don't filter */ 268 269 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 270 271 return augmented__output(args, augmented_args, len); 272 } 273 274 SEC("tp/syscalls/sys_enter_rename") 275 int sys_enter_rename(struct syscall_enter_args *args) 276 { 277 struct augmented_args_payload *augmented_args = augmented_args_payload(); 278 const void *oldpath_arg = (const void *)args->args[0], 279 *newpath_arg = (const void *)args->args[1]; 280 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 281 282 if (augmented_args == NULL) 283 return 1; /* Failure: don't filter */ 284 285 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 286 287 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 288 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 289 len += augmented_args->arg.size; 290 291 /* Every read from userspace is limited to value size */ 292 if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) 293 return 1; /* Failure: don't filter */ 294 295 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 296 297 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 298 arg2->size = newpath_len; 299 300 len += newpath_len; 301 302 return augmented__output(args, augmented_args, len); 303 } 304 305 SEC("tp/syscalls/sys_enter_renameat2") 306 int sys_enter_renameat2(struct syscall_enter_args *args) 307 { 308 struct augmented_args_payload *augmented_args = augmented_args_payload(); 309 const void *oldpath_arg = (const void *)args->args[1], 310 *newpath_arg = (const void *)args->args[3]; 311 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 312 313 if (augmented_args == NULL) 314 return 1; /* Failure: don't filter */ 315 316 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 317 318 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 319 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 320 len += augmented_args->arg.size; 321 322 /* Every read from userspace is limited to value size */ 323 if (augmented_args->arg.size > sizeof(augmented_args->arg.value)) 324 return 1; /* Failure: don't filter */ 325 326 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 327 328 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 329 arg2->size = newpath_len; 330 331 len += newpath_len; 332 333 return augmented__output(args, augmented_args, len); 334 } 335 336 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 337 338 // we need just the start, get the size to then copy it 339 struct perf_event_attr_size { 340 __u32 type; 341 /* 342 * Size of the attr structure, for fwd/bwd compat. 343 */ 344 __u32 size; 345 }; 346 347 SEC("tp/syscalls/sys_enter_perf_event_open") 348 int sys_enter_perf_event_open(struct syscall_enter_args *args) 349 { 350 struct augmented_args_payload *augmented_args = augmented_args_payload(); 351 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 352 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 353 354 if (augmented_args == NULL) 355 goto failure; 356 357 if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) 358 goto failure; 359 360 attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; 361 362 __u32 size = attr_read->size; 363 364 if (!size) 365 size = PERF_ATTR_SIZE_VER0; 366 367 if (size > sizeof(augmented_args->arg.value)) 368 goto failure; 369 370 // Now that we read attr->size and tested it against the size limits, read it completely 371 if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) 372 goto failure; 373 374 return augmented__output(args, augmented_args, len + size); 375 failure: 376 return 1; /* Failure: don't filter */ 377 } 378 379 SEC("tp/syscalls/sys_enter_clock_nanosleep") 380 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 381 { 382 struct augmented_args_payload *augmented_args = augmented_args_payload(); 383 const void *rqtp_arg = (const void *)args->args[2]; 384 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 385 __u32 size = sizeof(struct timespec64); 386 387 if (augmented_args == NULL) 388 goto failure; 389 390 if (size > sizeof(augmented_args->arg.value)) 391 goto failure; 392 393 bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); 394 395 return augmented__output(args, augmented_args, len + size); 396 failure: 397 return 1; /* Failure: don't filter */ 398 } 399 400 SEC("tp/syscalls/sys_enter_nanosleep") 401 int sys_enter_nanosleep(struct syscall_enter_args *args) 402 { 403 struct augmented_args_payload *augmented_args = augmented_args_payload(); 404 const void *req_arg = (const void *)args->args[0]; 405 unsigned int len = sizeof(augmented_args->args); 406 __u32 size = sizeof(struct timespec64); 407 408 if (augmented_args == NULL) 409 goto failure; 410 411 if (size > sizeof(augmented_args->arg.value)) 412 goto failure; 413 414 bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); 415 416 return augmented__output(args, augmented_args, len + size); 417 failure: 418 return 1; /* Failure: don't filter */ 419 } 420 421 static pid_t getpid(void) 422 { 423 return bpf_get_current_pid_tgid(); 424 } 425 426 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 427 { 428 return bpf_map_lookup_elem(pids, &pid) != NULL; 429 } 430 431 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) 432 { 433 bool augmented, do_output = false; 434 int zero = 0, size, aug_size, index, 435 value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); 436 u64 output = 0; /* has to be u64, otherwise it won't pass the verifier */ 437 unsigned int nr, *beauty_map; 438 struct beauty_payload_enter *payload; 439 void *arg, *payload_offset; 440 441 /* fall back to do predefined tail call */ 442 if (args == NULL) 443 return 1; 444 445 /* use syscall number to get beauty_map entry */ 446 nr = (__u32)args->syscall_nr; 447 beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); 448 449 /* set up payload for output */ 450 payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero); 451 payload_offset = (void *)&payload->aug_args; 452 453 if (beauty_map == NULL || payload == NULL) 454 return 1; 455 456 /* copy the sys_enter header, which has the syscall_nr */ 457 __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); 458 459 /* 460 * Determine what type of argument and how many bytes to read from user space, using the 461 * value in the beauty_map. This is the relation of parameter type and its corresponding 462 * value in the beauty map, and how many bytes we read eventually: 463 * 464 * string: 1 -> size of string 465 * struct: size of struct -> size of struct 466 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF) 467 */ 468 for (int i = 0; i < 6; i++) { 469 arg = (void *)args->args[i]; 470 augmented = false; 471 size = beauty_map[i]; 472 aug_size = size; /* size of the augmented data read from user space */ 473 474 if (size == 0 || arg == NULL) 475 continue; 476 477 if (size == 1) { /* string */ 478 aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg); 479 /* minimum of 0 to pass the verifier */ 480 if (aug_size < 0) 481 aug_size = 0; 482 483 augmented = true; 484 } else if (size > 0 && size <= value_size) { /* struct */ 485 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg)) 486 augmented = true; 487 } else if (size < 0 && size >= -6) { /* buffer */ 488 index = -(size + 1); 489 barrier_var(index); // Prevent clang (noticed with v18) from removing the &= 7 trick. 490 index &= 7; // Satisfy the bounds checking with the verifier in some kernels. 491 aug_size = args->args[index]; 492 493 if (aug_size > TRACE_AUG_MAX_BUF) 494 aug_size = TRACE_AUG_MAX_BUF; 495 496 if (aug_size > 0) { 497 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) 498 augmented = true; 499 } 500 } 501 502 /* Augmented data size is limited to sizeof(augmented_arg->unnamed union with value field) */ 503 if (aug_size > value_size) 504 aug_size = value_size; 505 506 /* write data to payload */ 507 if (augmented) { 508 int written = offsetof(struct augmented_arg, value) + aug_size; 509 510 if (written < 0 || written > sizeof(struct augmented_arg)) 511 return 1; 512 513 ((struct augmented_arg *)payload_offset)->size = aug_size; 514 output += written; 515 payload_offset += written; 516 do_output = true; 517 } 518 } 519 520 if (!do_output || (sizeof(struct syscall_enter_args) + output) > sizeof(struct beauty_payload_enter)) 521 return 1; 522 523 return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); 524 } 525 526 SEC("tp/raw_syscalls/sys_enter") 527 int sys_enter(struct syscall_enter_args *args) 528 { 529 struct augmented_args_payload *augmented_args; 530 /* 531 * We start len, the amount of data that will be in the perf ring 532 * buffer, if this is not filtered out by one of pid_filter__has(), 533 * syscall->enabled, etc, with the non-augmented raw syscall payload, 534 * i.e. sizeof(augmented_args->args). 535 * 536 * We'll add to this as we add augmented syscalls right after that 537 * initial, non-augmented raw_syscalls:sys_enter payload. 538 */ 539 540 if (pid_filter__has(&pids_filtered, getpid())) 541 return 0; 542 543 augmented_args = augmented_args_payload(); 544 if (augmented_args == NULL) 545 return 1; 546 547 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 548 549 /* 550 * Jump to syscall specific augmenter, even if the default one, 551 * "!raw_syscalls:unaugmented" that will just return 1 to return the 552 * unaugmented tracepoint payload. 553 */ 554 if (augment_sys_enter(args, &augmented_args->args)) 555 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 556 557 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 558 return 0; 559 } 560 561 SEC("tp/raw_syscalls/sys_exit") 562 int sys_exit(struct syscall_exit_args *args) 563 { 564 struct syscall_exit_args exit_args; 565 566 if (pid_filter__has(&pids_filtered, getpid())) 567 return 0; 568 569 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 570 /* 571 * Jump to syscall specific return augmenter, even if the default one, 572 * "!raw_syscalls:unaugmented" that will just return 1 to return the 573 * unaugmented tracepoint payload. 574 */ 575 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 576 /* 577 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 578 */ 579 return 0; 580 } 581 582 char _license[] SEC("license") = "GPL"; 583