1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments. 4 * 5 * This exactly matches what is marshalled into the raw_syscall:sys_enter 6 * payload expected by the 'perf trace' beautifiers. 7 */ 8 9 #include "vmlinux.h" 10 #include "../trace_augment.h" 11 12 #include <bpf/bpf_helpers.h> 13 #include <linux/limits.h> 14 15 #define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1) 16 #define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask)) 17 18 /** 19 * is_power_of_2() - check if a value is a power of two 20 * @n: the value to check 21 * 22 * Determine whether some value is a power of two, where zero is *not* 23 * considered a power of two. Return: true if @n is a power of 2, otherwise 24 * false. 25 */ 26 #define is_power_of_2(n) (n != 0 && ((n & (n - 1)) == 0)) 27 28 #define MAX_CPUS 4096 29 30 /* bpf-output associated map */ 31 struct __augmented_syscalls__ { 32 __uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY); 33 __type(key, int); 34 __type(value, __u32); 35 __uint(max_entries, MAX_CPUS); 36 } __augmented_syscalls__ SEC(".maps"); 37 38 /* 39 * What to augment at entry? 40 * 41 * Pointer arg payloads (filenames, etc) passed from userspace to the kernel 42 */ 43 struct syscalls_sys_enter { 44 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 45 __type(key, __u32); 46 __type(value, __u32); 47 __uint(max_entries, 512); 48 } syscalls_sys_enter SEC(".maps"); 49 50 /* 51 * What to augment at exit? 52 * 53 * Pointer arg payloads returned from the kernel (struct stat, etc) to userspace. 54 */ 55 struct syscalls_sys_exit { 56 __uint(type, BPF_MAP_TYPE_PROG_ARRAY); 57 __type(key, __u32); 58 __type(value, __u32); 59 __uint(max_entries, 512); 60 } syscalls_sys_exit SEC(".maps"); 61 62 struct syscall_enter_args { 63 unsigned long long common_tp_fields; 64 long syscall_nr; 65 unsigned long args[6]; 66 }; 67 68 struct syscall_exit_args { 69 unsigned long long common_tp_fields; 70 long syscall_nr; 71 long ret; 72 }; 73 74 /* 75 * Desired design of maximum size and alignment (see RFC2553) 76 */ 77 #define SS_MAXSIZE 128 /* Implementation specific max size */ 78 79 typedef unsigned short sa_family_t; 80 81 /* 82 * FIXME: Should come from system headers 83 * 84 * The definition uses anonymous union and struct in order to control the 85 * default alignment. 86 */ 87 struct sockaddr_storage { 88 union { 89 struct { 90 sa_family_t ss_family; /* address family */ 91 /* Following field(s) are implementation specific */ 92 char __data[SS_MAXSIZE - sizeof(unsigned short)]; 93 /* space to achieve desired size, */ 94 /* _SS_MAXSIZE value minus size of ss_family */ 95 }; 96 void *__align; /* implementation specific desired alignment */ 97 }; 98 }; 99 100 struct augmented_arg { 101 unsigned int size; 102 int err; 103 union { 104 char value[PATH_MAX]; 105 struct sockaddr_storage saddr; 106 }; 107 }; 108 109 struct pids_filtered { 110 __uint(type, BPF_MAP_TYPE_HASH); 111 __type(key, pid_t); 112 __type(value, bool); 113 __uint(max_entries, 64); 114 } pids_filtered SEC(".maps"); 115 116 struct augmented_args_payload { 117 struct syscall_enter_args args; 118 struct augmented_arg arg, arg2; // We have to reserve space for two arguments (rename, etc) 119 }; 120 121 // We need more tmp space than the BPF stack can give us 122 struct augmented_args_tmp { 123 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 124 __type(key, int); 125 __type(value, struct augmented_args_payload); 126 __uint(max_entries, 1); 127 } augmented_args_tmp SEC(".maps"); 128 129 struct beauty_map_enter { 130 __uint(type, BPF_MAP_TYPE_HASH); 131 __type(key, int); 132 __type(value, __u32[6]); 133 __uint(max_entries, 512); 134 } beauty_map_enter SEC(".maps"); 135 136 struct beauty_payload_enter { 137 struct syscall_enter_args args; 138 struct augmented_arg aug_args[6]; 139 }; 140 141 struct beauty_payload_enter_map { 142 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); 143 __type(key, int); 144 __type(value, struct beauty_payload_enter); 145 __uint(max_entries, 1); 146 } beauty_payload_enter_map SEC(".maps"); 147 148 static inline struct augmented_args_payload *augmented_args_payload(void) 149 { 150 int key = 0; 151 return bpf_map_lookup_elem(&augmented_args_tmp, &key); 152 } 153 154 static inline int augmented__output(void *ctx, struct augmented_args_payload *args, int len) 155 { 156 /* If perf_event_output fails, return non-zero so that it gets recorded unaugmented */ 157 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, args, len); 158 } 159 160 static inline int augmented__beauty_output(void *ctx, void *data, int len) 161 { 162 return bpf_perf_event_output(ctx, &__augmented_syscalls__, BPF_F_CURRENT_CPU, data, len); 163 } 164 165 static inline 166 unsigned int augmented_arg__read_str(struct augmented_arg *augmented_arg, const void *arg, unsigned int arg_len) 167 { 168 unsigned int augmented_len = sizeof(*augmented_arg); 169 int string_len = bpf_probe_read_user_str(&augmented_arg->value, arg_len, arg); 170 171 augmented_arg->size = augmented_arg->err = 0; 172 /* 173 * probe_read_str may return < 0, e.g. -EFAULT 174 * So we leave that in the augmented_arg->size that userspace will 175 */ 176 if (string_len > 0) { 177 augmented_len -= sizeof(augmented_arg->value) - string_len; 178 _Static_assert(is_power_of_2(sizeof(augmented_arg->value)), "sizeof(augmented_arg->value) needs to be a power of two"); 179 augmented_len &= sizeof(augmented_arg->value) - 1; 180 augmented_arg->size = string_len; 181 } else { 182 /* 183 * So that username notice the error while still being able 184 * to skip this augmented arg record 185 */ 186 augmented_arg->err = string_len; 187 augmented_len = offsetof(struct augmented_arg, value); 188 } 189 190 return augmented_len; 191 } 192 193 SEC("tp/raw_syscalls/sys_enter") 194 int syscall_unaugmented(struct syscall_enter_args *args) 195 { 196 return 1; 197 } 198 199 /* 200 * These will be tail_called from SEC("raw_syscalls:sys_enter"), so will find in 201 * augmented_args_tmp what was read by that raw_syscalls:sys_enter and go 202 * on from there, reading the first syscall arg as a string, i.e. open's 203 * filename. 204 */ 205 SEC("tp/syscalls/sys_enter_connect") 206 int sys_enter_connect(struct syscall_enter_args *args) 207 { 208 struct augmented_args_payload *augmented_args = augmented_args_payload(); 209 const void *sockaddr_arg = (const void *)args->args[1]; 210 unsigned int socklen = args->args[2]; 211 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 212 213 if (augmented_args == NULL) 214 return 1; /* Failure: don't filter */ 215 216 _Static_assert(is_power_of_2(sizeof(augmented_args->arg.saddr)), "sizeof(augmented_args->arg.saddr) needs to be a power of two"); 217 socklen &= sizeof(augmented_args->arg.saddr) - 1; 218 219 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 220 augmented_args->arg.size = socklen; 221 augmented_args->arg.err = 0; 222 223 return augmented__output(args, augmented_args, len + socklen); 224 } 225 226 SEC("tp/syscalls/sys_enter_sendto") 227 int sys_enter_sendto(struct syscall_enter_args *args) 228 { 229 struct augmented_args_payload *augmented_args = augmented_args_payload(); 230 const void *sockaddr_arg = (const void *)args->args[4]; 231 unsigned int socklen = args->args[5]; 232 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 233 234 if (augmented_args == NULL) 235 return 1; /* Failure: don't filter */ 236 237 socklen &= sizeof(augmented_args->arg.saddr) - 1; 238 239 bpf_probe_read_user(&augmented_args->arg.saddr, socklen, sockaddr_arg); 240 241 return augmented__output(args, augmented_args, len + socklen); 242 } 243 244 SEC("tp/syscalls/sys_enter_open") 245 int sys_enter_open(struct syscall_enter_args *args) 246 { 247 struct augmented_args_payload *augmented_args = augmented_args_payload(); 248 const void *filename_arg = (const void *)args->args[0]; 249 unsigned int len = sizeof(augmented_args->args); 250 251 if (augmented_args == NULL) 252 return 1; /* Failure: don't filter */ 253 254 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 255 256 return augmented__output(args, augmented_args, len); 257 } 258 259 SEC("tp/syscalls/sys_enter_openat") 260 int sys_enter_openat(struct syscall_enter_args *args) 261 { 262 struct augmented_args_payload *augmented_args = augmented_args_payload(); 263 const void *filename_arg = (const void *)args->args[1]; 264 unsigned int len = sizeof(augmented_args->args); 265 266 if (augmented_args == NULL) 267 return 1; /* Failure: don't filter */ 268 269 len += augmented_arg__read_str(&augmented_args->arg, filename_arg, sizeof(augmented_args->arg.value)); 270 271 return augmented__output(args, augmented_args, len); 272 } 273 274 SEC("tp/syscalls/sys_enter_rename") 275 int sys_enter_rename(struct syscall_enter_args *args) 276 { 277 struct augmented_args_payload *augmented_args = augmented_args_payload(); 278 const void *oldpath_arg = (const void *)args->args[0], 279 *newpath_arg = (const void *)args->args[1]; 280 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 281 282 if (augmented_args == NULL) 283 return 1; /* Failure: don't filter */ 284 285 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 286 287 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 288 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 289 len += augmented_args->arg.size; 290 291 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 292 293 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 294 arg2->size = newpath_len; 295 296 len += newpath_len; 297 298 return augmented__output(args, augmented_args, len); 299 } 300 301 SEC("tp/syscalls/sys_enter_renameat2") 302 int sys_enter_renameat2(struct syscall_enter_args *args) 303 { 304 struct augmented_args_payload *augmented_args = augmented_args_payload(); 305 const void *oldpath_arg = (const void *)args->args[1], 306 *newpath_arg = (const void *)args->args[3]; 307 unsigned int len = sizeof(augmented_args->args), oldpath_len, newpath_len; 308 309 if (augmented_args == NULL) 310 return 1; /* Failure: don't filter */ 311 312 len += 2 * sizeof(u64); // The overhead of size and err, just before the payload... 313 314 oldpath_len = augmented_arg__read_str(&augmented_args->arg, oldpath_arg, sizeof(augmented_args->arg.value)); 315 augmented_args->arg.size = PERF_ALIGN(oldpath_len + 1, sizeof(u64)); 316 len += augmented_args->arg.size; 317 318 struct augmented_arg *arg2 = (void *)&augmented_args->arg.value + augmented_args->arg.size; 319 320 newpath_len = augmented_arg__read_str(arg2, newpath_arg, sizeof(augmented_args->arg.value)); 321 arg2->size = newpath_len; 322 323 len += newpath_len; 324 325 return augmented__output(args, augmented_args, len); 326 } 327 328 #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ 329 330 // we need just the start, get the size to then copy it 331 struct perf_event_attr_size { 332 __u32 type; 333 /* 334 * Size of the attr structure, for fwd/bwd compat. 335 */ 336 __u32 size; 337 }; 338 339 SEC("tp/syscalls/sys_enter_perf_event_open") 340 int sys_enter_perf_event_open(struct syscall_enter_args *args) 341 { 342 struct augmented_args_payload *augmented_args = augmented_args_payload(); 343 const struct perf_event_attr_size *attr = (const struct perf_event_attr_size *)args->args[0], *attr_read; 344 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 345 346 if (augmented_args == NULL) 347 goto failure; 348 349 if (bpf_probe_read_user(&augmented_args->arg.value, sizeof(*attr), attr) < 0) 350 goto failure; 351 352 attr_read = (const struct perf_event_attr_size *)augmented_args->arg.value; 353 354 __u32 size = attr_read->size; 355 356 if (!size) 357 size = PERF_ATTR_SIZE_VER0; 358 359 if (size > sizeof(augmented_args->arg.value)) 360 goto failure; 361 362 // Now that we read attr->size and tested it against the size limits, read it completely 363 if (bpf_probe_read_user(&augmented_args->arg.value, size, attr) < 0) 364 goto failure; 365 366 return augmented__output(args, augmented_args, len + size); 367 failure: 368 return 1; /* Failure: don't filter */ 369 } 370 371 SEC("tp/syscalls/sys_enter_clock_nanosleep") 372 int sys_enter_clock_nanosleep(struct syscall_enter_args *args) 373 { 374 struct augmented_args_payload *augmented_args = augmented_args_payload(); 375 const void *rqtp_arg = (const void *)args->args[2]; 376 unsigned int len = sizeof(u64) + sizeof(augmented_args->args); // the size + err in all 'augmented_arg' structs 377 __u32 size = sizeof(struct timespec64); 378 379 if (augmented_args == NULL) 380 goto failure; 381 382 if (size > sizeof(augmented_args->arg.value)) 383 goto failure; 384 385 bpf_probe_read_user(&augmented_args->arg.value, size, rqtp_arg); 386 387 return augmented__output(args, augmented_args, len + size); 388 failure: 389 return 1; /* Failure: don't filter */ 390 } 391 392 SEC("tp/syscalls/sys_enter_nanosleep") 393 int sys_enter_nanosleep(struct syscall_enter_args *args) 394 { 395 struct augmented_args_payload *augmented_args = augmented_args_payload(); 396 const void *req_arg = (const void *)args->args[0]; 397 unsigned int len = sizeof(augmented_args->args); 398 __u32 size = sizeof(struct timespec64); 399 400 if (augmented_args == NULL) 401 goto failure; 402 403 if (size > sizeof(augmented_args->arg.value)) 404 goto failure; 405 406 bpf_probe_read_user(&augmented_args->arg.value, size, req_arg); 407 408 return augmented__output(args, augmented_args, len + size); 409 failure: 410 return 1; /* Failure: don't filter */ 411 } 412 413 static pid_t getpid(void) 414 { 415 return bpf_get_current_pid_tgid(); 416 } 417 418 static bool pid_filter__has(struct pids_filtered *pids, pid_t pid) 419 { 420 return bpf_map_lookup_elem(pids, &pid) != NULL; 421 } 422 423 static int augment_sys_enter(void *ctx, struct syscall_enter_args *args) 424 { 425 bool augmented, do_output = false; 426 int zero = 0, size, aug_size, index, output = 0, 427 value_size = sizeof(struct augmented_arg) - offsetof(struct augmented_arg, value); 428 unsigned int nr, *beauty_map; 429 struct beauty_payload_enter *payload; 430 void *arg, *payload_offset; 431 432 /* fall back to do predefined tail call */ 433 if (args == NULL) 434 return 1; 435 436 /* use syscall number to get beauty_map entry */ 437 nr = (__u32)args->syscall_nr; 438 beauty_map = bpf_map_lookup_elem(&beauty_map_enter, &nr); 439 440 /* set up payload for output */ 441 payload = bpf_map_lookup_elem(&beauty_payload_enter_map, &zero); 442 payload_offset = (void *)&payload->aug_args; 443 444 if (beauty_map == NULL || payload == NULL) 445 return 1; 446 447 /* copy the sys_enter header, which has the syscall_nr */ 448 __builtin_memcpy(&payload->args, args, sizeof(struct syscall_enter_args)); 449 450 /* 451 * Determine what type of argument and how many bytes to read from user space, using the 452 * value in the beauty_map. This is the relation of parameter type and its corresponding 453 * value in the beauty map, and how many bytes we read eventually: 454 * 455 * string: 1 -> size of string 456 * struct: size of struct -> size of struct 457 * buffer: -1 * (index of paired len) -> value of paired len (maximum: TRACE_AUG_MAX_BUF) 458 */ 459 for (int i = 0; i < 6; i++) { 460 arg = (void *)args->args[i]; 461 augmented = false; 462 size = beauty_map[i]; 463 aug_size = size; /* size of the augmented data read from user space */ 464 465 if (size == 0 || arg == NULL) 466 continue; 467 468 if (size == 1) { /* string */ 469 aug_size = bpf_probe_read_user_str(((struct augmented_arg *)payload_offset)->value, value_size, arg); 470 /* minimum of 0 to pass the verifier */ 471 if (aug_size < 0) 472 aug_size = 0; 473 474 augmented = true; 475 } else if (size > 0 && size <= value_size) { /* struct */ 476 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, size, arg)) 477 augmented = true; 478 } else if (size < 0 && size >= -6) { /* buffer */ 479 index = -(size + 1); 480 aug_size = args->args[index]; 481 482 if (aug_size > TRACE_AUG_MAX_BUF) 483 aug_size = TRACE_AUG_MAX_BUF; 484 485 if (aug_size > 0) { 486 if (!bpf_probe_read_user(((struct augmented_arg *)payload_offset)->value, aug_size, arg)) 487 augmented = true; 488 } 489 } 490 491 /* write data to payload */ 492 if (augmented) { 493 int written = offsetof(struct augmented_arg, value) + aug_size; 494 495 ((struct augmented_arg *)payload_offset)->size = aug_size; 496 output += written; 497 payload_offset += written; 498 do_output = true; 499 } 500 } 501 502 if (!do_output) 503 return 1; 504 505 return augmented__beauty_output(ctx, payload, sizeof(struct syscall_enter_args) + output); 506 } 507 508 SEC("tp/raw_syscalls/sys_enter") 509 int sys_enter(struct syscall_enter_args *args) 510 { 511 struct augmented_args_payload *augmented_args; 512 /* 513 * We start len, the amount of data that will be in the perf ring 514 * buffer, if this is not filtered out by one of pid_filter__has(), 515 * syscall->enabled, etc, with the non-augmented raw syscall payload, 516 * i.e. sizeof(augmented_args->args). 517 * 518 * We'll add to this as we add augmented syscalls right after that 519 * initial, non-augmented raw_syscalls:sys_enter payload. 520 */ 521 522 if (pid_filter__has(&pids_filtered, getpid())) 523 return 0; 524 525 augmented_args = augmented_args_payload(); 526 if (augmented_args == NULL) 527 return 1; 528 529 bpf_probe_read_kernel(&augmented_args->args, sizeof(augmented_args->args), args); 530 531 /* 532 * Jump to syscall specific augmenter, even if the default one, 533 * "!raw_syscalls:unaugmented" that will just return 1 to return the 534 * unaugmented tracepoint payload. 535 */ 536 if (augment_sys_enter(args, &augmented_args->args)) 537 bpf_tail_call(args, &syscalls_sys_enter, augmented_args->args.syscall_nr); 538 539 // If not found on the PROG_ARRAY syscalls map, then we're filtering it: 540 return 0; 541 } 542 543 SEC("tp/raw_syscalls/sys_exit") 544 int sys_exit(struct syscall_exit_args *args) 545 { 546 struct syscall_exit_args exit_args; 547 548 if (pid_filter__has(&pids_filtered, getpid())) 549 return 0; 550 551 bpf_probe_read_kernel(&exit_args, sizeof(exit_args), args); 552 /* 553 * Jump to syscall specific return augmenter, even if the default one, 554 * "!raw_syscalls:unaugmented" that will just return 1 to return the 555 * unaugmented tracepoint payload. 556 */ 557 bpf_tail_call(args, &syscalls_sys_exit, exit_args.syscall_nr); 558 /* 559 * If not found on the PROG_ARRAY syscalls map, then we're filtering it: 560 */ 561 return 0; 562 } 563 564 char _license[] SEC("license") = "GPL"; 565