// SPDX-License-Identifier: GPL-2.0 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "trace_helpers.h" #include #include #include #include "bpf/hashmap.h" #include "bpf/libbpf_internal.h" #include "bpf_util.h" #define TRACEFS_PIPE "/sys/kernel/tracing/trace_pipe" #define DEBUGFS_PIPE "/sys/kernel/debug/tracing/trace_pipe" struct ksyms { struct ksym *syms; size_t sym_cap; size_t sym_cnt; }; static struct ksyms *ksyms; static pthread_mutex_t ksyms_mutex = PTHREAD_MUTEX_INITIALIZER; static int ksyms__add_symbol(struct ksyms *ksyms, const char *name, unsigned long addr) { void *tmp; tmp = strdup(name); if (!tmp) return -ENOMEM; ksyms->syms[ksyms->sym_cnt].addr = addr; ksyms->syms[ksyms->sym_cnt].name = tmp; ksyms->sym_cnt++; return 0; } void free_kallsyms_local(struct ksyms *ksyms) { unsigned int i; if (!ksyms) return; if (!ksyms->syms) { free(ksyms); return; } for (i = 0; i < ksyms->sym_cnt; i++) free(ksyms->syms[i].name); free(ksyms->syms); free(ksyms); } static struct ksyms *load_kallsyms_local_common(ksym_cmp_t cmp_cb) { FILE *f; char func[256], buf[256]; char symbol; void *addr; int ret; struct ksyms *ksyms; f = fopen("/proc/kallsyms", "r"); if (!f) return NULL; ksyms = calloc(1, sizeof(struct ksyms)); if (!ksyms) { fclose(f); return NULL; } while (fgets(buf, sizeof(buf), f)) { if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3) break; if (!addr) continue; ret = libbpf_ensure_mem((void **) &ksyms->syms, &ksyms->sym_cap, sizeof(struct ksym), ksyms->sym_cnt + 1); if (ret) goto error; ret = ksyms__add_symbol(ksyms, func, (unsigned long)addr); if (ret) goto error; } fclose(f); qsort(ksyms->syms, ksyms->sym_cnt, sizeof(struct ksym), cmp_cb); return ksyms; error: fclose(f); free_kallsyms_local(ksyms); return NULL; } static int ksym_cmp(const void *p1, const void *p2) { return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr; } struct ksyms *load_kallsyms_local(void) { return load_kallsyms_local_common(ksym_cmp); } struct ksyms *load_kallsyms_custom_local(ksym_cmp_t cmp_cb) { return load_kallsyms_local_common(cmp_cb); } int load_kallsyms(void) { pthread_mutex_lock(&ksyms_mutex); if (!ksyms) ksyms = load_kallsyms_local(); pthread_mutex_unlock(&ksyms_mutex); return ksyms ? 0 : 1; } struct ksym *ksym_search_local(struct ksyms *ksyms, long key) { int start = 0, end = ksyms->sym_cnt; int result; /* kallsyms not loaded. return NULL */ if (ksyms->sym_cnt <= 0) return NULL; while (start < end) { size_t mid = start + (end - start) / 2; result = key - ksyms->syms[mid].addr; if (result < 0) end = mid; else if (result > 0) start = mid + 1; else return &ksyms->syms[mid]; } if (start >= 1 && ksyms->syms[start - 1].addr < key && key < ksyms->syms[start].addr) /* valid ksym */ return &ksyms->syms[start - 1]; /* out of range. return _stext */ return &ksyms->syms[0]; } struct ksym *search_kallsyms_custom_local(struct ksyms *ksyms, const void *p, ksym_search_cmp_t cmp_cb) { int start = 0, mid, end = ksyms->sym_cnt; struct ksym *ks; int result; while (start < end) { mid = start + (end - start) / 2; ks = &ksyms->syms[mid]; result = cmp_cb(p, ks); if (result < 0) end = mid; else if (result > 0) start = mid + 1; else return ks; } return NULL; } struct ksym *ksym_search(long key) { if (!ksyms) return NULL; return ksym_search_local(ksyms, key); } long ksym_get_addr_local(struct ksyms *ksyms, const char *name) { int i; for (i = 0; i < ksyms->sym_cnt; i++) { if (strcmp(ksyms->syms[i].name, name) == 0) return ksyms->syms[i].addr; } return 0; } long ksym_get_addr(const char *name) { if (!ksyms) return 0; return ksym_get_addr_local(ksyms, name); } /* open kallsyms and read symbol addresses on the fly. Without caching all symbols, * this is faster than load + find. */ int kallsyms_find(const char *sym, unsigned long long *addr) { char type, name[500], *match; unsigned long long value; int err = 0; FILE *f; f = fopen("/proc/kallsyms", "r"); if (!f) return -EINVAL; while (fscanf(f, "%llx %c %499s%*[^\n]\n", &value, &type, name) > 0) { /* If CONFIG_LTO_CLANG_THIN is enabled, static variable/function * symbols could be promoted to global due to cross-file inlining. * For such cases, clang compiler will add .llvm. suffix * to those symbols to avoid potential naming conflict. * Let us ignore .llvm. suffix during symbol comparison. */ if (type == 'd') { match = strstr(name, ".llvm."); if (match) *match = '\0'; } if (strcmp(name, sym) == 0) { *addr = value; goto out; } } err = -ENOENT; out: fclose(f); return err; } #ifdef PROCMAP_QUERY int env_verbosity __weak = 0; static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { char path_buf[PATH_MAX], build_id_buf[20]; struct procmap_query q; int err; memset(&q, 0, sizeof(q)); q.size = sizeof(q); q.query_flags = query_flags; q.query_addr = (__u64)addr; q.vma_name_addr = (__u64)path_buf; q.vma_name_size = sizeof(path_buf); q.build_id_addr = (__u64)build_id_buf; q.build_id_size = sizeof(build_id_buf); err = ioctl(fd, PROCMAP_QUERY, &q); if (err < 0) { err = -errno; if (err == -ENOTTY) return -EOPNOTSUPP; /* ioctl() not implemented yet */ if (err == -ENOENT) return -ESRCH; /* vma not found */ return err; } if (env_verbosity >= 1) { printf("VMA FOUND (addr %08lx): %08lx-%08lx %c%c%c%c %08lx %02x:%02x %ld %s (build ID: %s, %d bytes)\n", (long)addr, (long)q.vma_start, (long)q.vma_end, (q.vma_flags & PROCMAP_QUERY_VMA_READABLE) ? 'r' : '-', (q.vma_flags & PROCMAP_QUERY_VMA_WRITABLE) ? 'w' : '-', (q.vma_flags & PROCMAP_QUERY_VMA_EXECUTABLE) ? 'x' : '-', (q.vma_flags & PROCMAP_QUERY_VMA_SHARED) ? 's' : 'p', (long)q.vma_offset, q.dev_major, q.dev_minor, (long)q.inode, q.vma_name_size ? path_buf : "", q.build_id_size ? "YES" : "NO", q.build_id_size); } *start = q.vma_start; *offset = q.vma_offset; *flags = q.vma_flags; return 0; } #else # ifndef PROCMAP_QUERY_VMA_EXECUTABLE # define PROCMAP_QUERY_VMA_EXECUTABLE 0x04 # endif static int procmap_query(int fd, const void *addr, __u32 query_flags, size_t *start, size_t *offset, int *flags) { return -EOPNOTSUPP; } #endif ssize_t get_uprobe_offset(const void *addr) { size_t start, base, end; FILE *f; char buf[256]; int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; /* requested executable VMA only */ err = procmap_query(fileno(f), addr, PROCMAP_QUERY_VMA_EXECUTABLE, &start, &base, &flags); if (err == -EOPNOTSUPP) { bool found = false; while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &base) == 4) { if (buf[2] == 'x' && (uintptr_t)addr >= start && (uintptr_t)addr < end) { found = true; break; } } if (!found) { fclose(f); return -ESRCH; } } else if (err) { fclose(f); return err; } fclose(f); #if defined(__powerpc64__) && defined(_CALL_ELF) && _CALL_ELF == 2 #define OP_RT_RA_MASK 0xffff0000UL #define LIS_R2 0x3c400000UL #define ADDIS_R2_R12 0x3c4c0000UL #define ADDI_R2_R2 0x38420000UL /* * A PPC64 ABIv2 function may have a local and a global entry * point. We need to use the local entry point when patching * functions, so identify and step over the global entry point * sequence. * * The global entry point sequence is always of the form: * * addis r2,r12,XXXX * addi r2,r2,XXXX * * A linker optimisation may convert the addis to lis: * * lis r2,XXXX * addi r2,r2,XXXX */ { const __u32 *insn = (const __u32 *)(uintptr_t)addr; if ((((*insn & OP_RT_RA_MASK) == ADDIS_R2_R12) || ((*insn & OP_RT_RA_MASK) == LIS_R2)) && ((*(insn + 1) & OP_RT_RA_MASK) == ADDI_R2_R2)) return (uintptr_t)(insn + 2) - start + base; } #endif return (uintptr_t)addr - start + base; } ssize_t get_rel_offset(uintptr_t addr) { size_t start, end, offset; char buf[256]; FILE *f; int err, flags; f = fopen("/proc/self/maps", "r"); if (!f) return -errno; err = procmap_query(fileno(f), (const void *)addr, 0, &start, &offset, &flags); if (err == 0) { fclose(f); return (size_t)addr - start + offset; } else if (err != -EOPNOTSUPP) { fclose(f); return err; } else if (err) { while (fscanf(f, "%zx-%zx %s %zx %*[^\n]\n", &start, &end, buf, &offset) == 4) { if (addr >= start && addr < end) { fclose(f); return (size_t)addr - start + offset; } } } fclose(f); return -EINVAL; } static int parse_build_id_buf(const void *note_start, Elf32_Word note_size, char *build_id) { Elf32_Word note_offs = 0; while (note_offs + sizeof(Elf32_Nhdr) < note_size) { Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); if (nhdr->n_type == 3 && nhdr->n_namesz == sizeof("GNU") && !strcmp((char *)(nhdr + 1), "GNU") && nhdr->n_descsz > 0 && nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { memcpy(build_id, note_start + note_offs + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BPF_BUILD_ID_SIZE - nhdr->n_descsz); return (int) nhdr->n_descsz; } note_offs = note_offs + sizeof(Elf32_Nhdr) + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); } return -ENOENT; } /* Reads binary from *path* file and returns it in the *build_id* buffer * with *size* which is expected to be at least BPF_BUILD_ID_SIZE bytes. * Returns size of build id on success. On error the error value is * returned. */ int read_build_id(const char *path, char *build_id, size_t size) { int fd, err = -EINVAL; Elf *elf = NULL; GElf_Ehdr ehdr; size_t max, i; if (size < BPF_BUILD_ID_SIZE) return -EINVAL; fd = open(path, O_RDONLY | O_CLOEXEC); if (fd < 0) return -errno; (void)elf_version(EV_CURRENT); elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); if (!elf) goto out; if (elf_kind(elf) != ELF_K_ELF) goto out; if (!gelf_getehdr(elf, &ehdr)) goto out; for (i = 0; i < ehdr.e_phnum; i++) { GElf_Phdr mem, *phdr; char *data; phdr = gelf_getphdr(elf, i, &mem); if (!phdr) goto out; if (phdr->p_type != PT_NOTE) continue; data = elf_rawfile(elf, &max); if (!data) goto out; if (phdr->p_offset + phdr->p_memsz > max) goto out; err = parse_build_id_buf(data + phdr->p_offset, phdr->p_memsz, build_id); if (err > 0) break; } out: if (elf) elf_end(elf); close(fd); return err; } int read_trace_pipe_iter(void (*cb)(const char *str, void *data), void *data, int iter) { size_t buflen, n; char *buf = NULL; FILE *fp = NULL; if (access(TRACEFS_PIPE, F_OK) == 0) fp = fopen(TRACEFS_PIPE, "r"); else fp = fopen(DEBUGFS_PIPE, "r"); if (!fp) return -1; /* We do not want to wait forever when iter is specified. */ if (iter) fcntl(fileno(fp), F_SETFL, O_NONBLOCK); while ((n = getline(&buf, &buflen, fp) >= 0) || errno == EAGAIN) { if (n > 0) cb(buf, data); if (iter && !(--iter)) break; } free(buf); if (fp) fclose(fp); return 0; } static void trace_pipe_cb(const char *str, void *data) { printf("%s", str); } void read_trace_pipe(void) { read_trace_pipe_iter(trace_pipe_cb, NULL, 0); } static size_t symbol_hash(long key, void *ctx __maybe_unused) { return str_hash((const char *) key); } static bool symbol_equal(long key1, long key2, void *ctx __maybe_unused) { return strcmp((const char *) key1, (const char *) key2) == 0; } static bool is_invalid_entry(char *buf, bool kernel) { if (kernel && strchr(buf, '[')) return true; if (!kernel && !strchr(buf, '[')) return true; return false; } static const char * const trace_blacklist[] = { "migrate_disable", "migrate_enable", "rcu_read_unlock_strict", "preempt_count_add", "preempt_count_sub", "__rcu_read_lock", "__rcu_read_unlock", "bpf_get_numa_node_id", }; static bool skip_entry(char *name) { int i; /* * We attach to almost all kernel functions and some of them * will cause 'suspicious RCU usage' when fprobe is attached * to them. Filter out the current culprits - arch_cpu_idle * default_idle and rcu_* functions. */ if (!strcmp(name, "arch_cpu_idle")) return true; if (!strcmp(name, "default_idle")) return true; if (!strncmp(name, "rcu_", 4)) return true; if (!strcmp(name, "bpf_dispatcher_xdp_func")) return true; if (!strncmp(name, "__ftrace_invalid_address__", sizeof("__ftrace_invalid_address__") - 1)) return true; for (i = 0; i < ARRAY_SIZE(trace_blacklist); i++) { if (!strcmp(name, trace_blacklist[i])) return true; } return false; } /* Do comparison by ignoring '.llvm.' suffixes. */ static int compare_name(const char *name1, const char *name2) { const char *res1, *res2; int len1, len2; res1 = strstr(name1, ".llvm."); res2 = strstr(name2, ".llvm."); len1 = res1 ? res1 - name1 : strlen(name1); len2 = res2 ? res2 - name2 : strlen(name2); if (len1 == len2) return strncmp(name1, name2, len1); if (len1 < len2) return strncmp(name1, name2, len1) <= 0 ? -1 : 1; return strncmp(name1, name2, len2) >= 0 ? 1 : -1; } static int load_kallsyms_compare(const void *p1, const void *p2) { return compare_name(((const struct ksym *)p1)->name, ((const struct ksym *)p2)->name); } static int search_kallsyms_compare(const void *p1, const struct ksym *p2) { return compare_name(p1, p2->name); } int bpf_get_ksyms(char ***symsp, size_t *cntp, bool kernel) { size_t cap = 0, cnt = 0; char *name = NULL, *ksym_name, **syms = NULL; struct hashmap *map; struct ksyms *ksyms; struct ksym *ks; char buf[256]; FILE *f; int err = 0; ksyms = load_kallsyms_custom_local(load_kallsyms_compare); if (!ksyms) return -EINVAL; /* * The available_filter_functions contains many duplicates, * but other than that all symbols are usable to trace. * Filtering out duplicates by using hashmap__add, which won't * add existing entry. */ if (access("/sys/kernel/tracing/trace", F_OK) == 0) f = fopen("/sys/kernel/tracing/available_filter_functions", "r"); else f = fopen("/sys/kernel/debug/tracing/available_filter_functions", "r"); if (!f) return -EINVAL; map = hashmap__new(symbol_hash, symbol_equal, NULL); if (IS_ERR(map)) { err = libbpf_get_error(map); goto error; } while (fgets(buf, sizeof(buf), f)) { if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%ms$*[^\n]\n", &name) != 1) continue; if (skip_entry(name)) continue; ks = search_kallsyms_custom_local(ksyms, name, search_kallsyms_compare); if (!ks) { err = -EINVAL; goto error; } ksym_name = ks->name; err = hashmap__add(map, ksym_name, 0); if (err == -EEXIST) { err = 0; continue; } if (err) goto error; err = libbpf_ensure_mem((void **) &syms, &cap, sizeof(*syms), cnt + 1); if (err) goto error; syms[cnt++] = ksym_name; } *symsp = syms; *cntp = cnt; error: free(name); fclose(f); hashmap__free(map); if (err) free(syms); return err; } int bpf_get_addrs(unsigned long **addrsp, size_t *cntp, bool kernel) { unsigned long *addr, *addrs, *tmp_addrs; int err = 0, max_cnt, inc_cnt; char *name = NULL; size_t cnt = 0; char buf[256]; FILE *f; if (access("/sys/kernel/tracing/trace", F_OK) == 0) f = fopen("/sys/kernel/tracing/available_filter_functions_addrs", "r"); else f = fopen("/sys/kernel/debug/tracing/available_filter_functions_addrs", "r"); if (!f) return -ENOENT; /* In my local setup, the number of entries is 50k+ so Let us initially * allocate space to hold 64k entries. If 64k is not enough, incrementally * increase 1k each time. */ max_cnt = 65536; inc_cnt = 1024; addrs = malloc(max_cnt * sizeof(long)); if (addrs == NULL) { err = -ENOMEM; goto error; } while (fgets(buf, sizeof(buf), f)) { if (is_invalid_entry(buf, kernel)) continue; free(name); if (sscanf(buf, "%p %ms$*[^\n]\n", &addr, &name) != 2) continue; if (skip_entry(name)) continue; if (cnt == max_cnt) { max_cnt += inc_cnt; tmp_addrs = realloc(addrs, max_cnt); if (!tmp_addrs) { err = -ENOMEM; goto error; } addrs = tmp_addrs; } addrs[cnt++] = (unsigned long)addr; } *addrsp = addrs; *cntp = cnt; error: free(name); fclose(f); if (err) free(addrs); return err; }