// SPDX-License-Identifier: GPL-2.0
#include "util/bpf_counter.h"
#include "util/debug.h"
#include "util/evsel.h"
#include "util/evlist.h"
#include "util/off_cpu.h"
#include "util/perf-hooks.h"
#include "util/record.h"
#include "util/session.h"
#include "util/target.h"
#include "util/cpumap.h"
#include "util/thread_map.h"
#include "util/cgroup.h"
#include "util/strlist.h"
#include <bpf/bpf.h>
#include <bpf/btf.h>
#include <internal/xyarray.h>
#include <linux/time64.h>

#include "bpf_skel/off_cpu.skel.h"

#define MAX_STACKS 32
#define MAX_PROC 4096
/* we don't need an actual timestamp, we just want the samples sorted last */
#define OFF_CPU_TIMESTAMP (~0ull << 32)

static struct off_cpu_bpf *skel;

struct off_cpu_key {
	u32 pid;
	u32 tgid;
	u32 stack_id;
	u32 state;
	u64 cgroup_id;
};

union off_cpu_data {
	struct perf_event_header hdr;
	u64 array[1024 / sizeof(u64)];
};

u64 off_cpu_raw[MAX_STACKS + 5];

static int off_cpu_config(struct evlist *evlist)
{
	char off_cpu_event[64];
	struct evsel *evsel;

	scnprintf(off_cpu_event, sizeof(off_cpu_event), "bpf-output/name=%s/", OFFCPU_EVENT);
	if (parse_event(evlist, off_cpu_event)) {
		pr_err("Failed to open off-cpu event\n");
		return -1;
	}

	evlist__for_each_entry(evlist, evsel) {
		if (evsel__is_offcpu_event(evsel)) {
			evsel->core.system_wide = true;
			break;
		}
	}

	return 0;
}

static void off_cpu_start(void *arg)
{
	struct evlist *evlist = arg;
	struct evsel *evsel;
	struct perf_cpu pcpu;
	int i;

	/* update task filter for the given workload */
	if (skel->rodata->has_task && skel->rodata->uses_tgid &&
	    perf_thread_map__pid(evlist->core.threads, 0) != -1) {
		int fd;
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);
		pid = perf_thread_map__pid(evlist->core.threads, 0);
		bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
	}

	/* update BPF perf_event map */
	evsel = evlist__find_evsel_by_str(evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return;
	}

	perf_cpu_map__for_each_cpu(pcpu, i, evsel->core.cpus) {
		int err;
		int cpu_nr = pcpu.cpu;

		err = bpf_map__update_elem(skel->maps.offcpu_output, &cpu_nr, sizeof(int),
					   xyarray__entry(evsel->core.fd, cpu_nr, 0),
					   sizeof(int), BPF_ANY);
		if (err) {
			pr_err("Failed to update perf event map for direct off-cpu dumping\n");
			return;
		}
	}

	skel->bss->enabled = 1;
}

static void off_cpu_finish(void *arg __maybe_unused)
{
	skel->bss->enabled = 0;
	off_cpu_bpf__destroy(skel);
}

/* the v5.18 kernel added a prev_state arg, so we need to check the signature */
static void check_sched_switch_args(void)
{
	struct btf *btf = btf__load_vmlinux_btf();
	const struct btf_type *t1, *t2, *t3;
	u32 type_id;

	if (!btf) {
		pr_debug("Missing btf, check if CONFIG_DEBUG_INFO_BTF is enabled\n");
		goto cleanup;
	}

	type_id = btf__find_by_name_kind(btf, "btf_trace_sched_switch",
					 BTF_KIND_TYPEDEF);
	if ((s32)type_id < 0)
		goto cleanup;

	t1 = btf__type_by_id(btf, type_id);
	if (t1 == NULL)
		goto cleanup;

	t2 = btf__type_by_id(btf, t1->type);
	if (t2 == NULL || !btf_is_ptr(t2))
		goto cleanup;

	t3 = btf__type_by_id(btf, t2->type);
	/* btf_trace func proto has one more argument for the context */
	if (t3 && btf_is_func_proto(t3) && btf_vlen(t3) == 5) {
		/* new format: pass prev_state as 4th arg */
		skel->rodata->has_prev_state = true;
	}
cleanup:
	btf__free(btf);
}
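/*
 * For orientation, the shape probed above is (a sketch based on the
 * post-v5.18 tracepoint definition, not copied from any particular
 * vmlinux BTF):
 *
 *   typedef void (*btf_trace_sched_switch)(void *ctx, bool preempt,
 *					    struct task_struct *prev,
 *					    struct task_struct *next,
 *					    unsigned int prev_state);
 *
 * i.e. the four tracepoint arguments plus the extra context pointer,
 * which is why btf_vlen(t3) == 5 identifies the new signature that
 * carries prev_state.
 */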
int off_cpu_prepare(struct evlist *evlist, struct target *target,
		    struct record_opts *opts)
{
	int err, fd, i;
	int ncpus = 1, ntasks = 1, ncgrps = 1;
	struct strlist *pid_slist = NULL;
	struct str_node *pos;

	if (off_cpu_config(evlist) < 0) {
		pr_err("Failed to configure off-cpu BPF event\n");
		return -1;
	}

	skel = off_cpu_bpf__open();
	if (!skel) {
		pr_err("Failed to open off-cpu BPF skeleton\n");
		return -1;
	}

	/* don't need to set cpu filter for system-wide mode */
	if (target->cpu_list) {
		ncpus = perf_cpu_map__nr(evlist->core.user_requested_cpus);
		bpf_map__set_max_entries(skel->maps.cpu_filter, ncpus);
		skel->rodata->has_cpu = 1;
	}

	if (target->pid) {
		pid_slist = strlist__new(target->pid, NULL);
		if (!pid_slist) {
			pr_err("Failed to create a strlist for pid\n");
			return -1;
		}

		ntasks = 0;
		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			ntasks++;
		}

		if (ntasks < MAX_PROC)
			ntasks = MAX_PROC;

		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	} else if (target__has_task(target)) {
		ntasks = perf_thread_map__nr(evlist->core.threads);
		bpf_map__set_max_entries(skel->maps.task_filter, ntasks);
		skel->rodata->has_task = 1;
	} else if (target__none(target)) {
		bpf_map__set_max_entries(skel->maps.task_filter, MAX_PROC);
		skel->rodata->has_task = 1;
		skel->rodata->uses_tgid = 1;
	}

	if (evlist__first(evlist)->cgrp) {
		ncgrps = evlist->core.nr_entries - 1; /* excluding a dummy */
		bpf_map__set_max_entries(skel->maps.cgroup_filter, ncgrps);

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
		skel->rodata->has_cgroup = 1;
	}

	if (opts->record_cgroup) {
		skel->rodata->needs_cgroup = true;

		if (!cgroup_is_v2("perf_event"))
			skel->rodata->uses_cgroup_v1 = true;
	}

	set_max_rlimit();
	check_sched_switch_args();

	err = off_cpu_bpf__load(skel);
	if (err) {
		pr_err("Failed to load off-cpu skeleton\n");
		goto out;
	}

	if (target->cpu_list) {
		u32 cpu;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cpu_filter);

		for (i = 0; i < ncpus; i++) {
			cpu = perf_cpu_map__cpu(evlist->core.user_requested_cpus, i).cpu;
			bpf_map_update_elem(fd, &cpu, &val, BPF_ANY);
		}
	}

	if (target->pid) {
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		strlist__for_each_entry(pos, pid_slist) {
			char *end_ptr;
			u32 tgid;
			int pid = strtol(pos->s, &end_ptr, 10);

			if (pid == INT_MIN || pid == INT_MAX ||
			    (*end_ptr != '\0' && *end_ptr != ','))
				continue;

			tgid = pid;
			bpf_map_update_elem(fd, &tgid, &val, BPF_ANY);
		}
	} else if (target__has_task(target)) {
		u32 pid;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.task_filter);

		for (i = 0; i < ntasks; i++) {
			pid = perf_thread_map__pid(evlist->core.threads, i);
			bpf_map_update_elem(fd, &pid, &val, BPF_ANY);
		}
	}

	if (evlist__first(evlist)->cgrp) {
		struct evsel *evsel;
		u8 val = 1;

		fd = bpf_map__fd(skel->maps.cgroup_filter);

		evlist__for_each_entry(evlist, evsel) {
			struct cgroup *cgrp = evsel->cgrp;

			if (cgrp == NULL)
				continue;

			if (!cgrp->id && read_cgroup_id(cgrp) < 0) {
				pr_err("Failed to read cgroup id of %s\n",
				       cgrp->name);
				goto out;
			}

			bpf_map_update_elem(fd, &cgrp->id, &val, BPF_ANY);
		}
	}

	skel->bss->offcpu_thresh_ns = opts->off_cpu_thresh_ns;

	err = off_cpu_bpf__attach(skel);
	if (err) {
		pr_err("Failed to attach off-cpu BPF skeleton\n");
		goto out;
	}

	if (perf_hooks__set_hook("record_start", off_cpu_start, evlist) ||
	    perf_hooks__set_hook("record_end", off_cpu_finish, evlist)) {
		pr_err("Failed to set off-cpu record hooks\n");
		goto out;
	}

	return 0;

out:
	off_cpu_bpf__destroy(skel);
	return -1;
}
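/*
 * Map shapes implied by the user-space code in this file (a sketch for
 * reference only; the authoritative definitions live in
 * bpf_skel/off_cpu.bpf.c and may differ in detail):
 *
 *   struct {
 *	__uint(type, BPF_MAP_TYPE_HASH);
 *	__type(key, __u32);			// pid or tgid
 *	__type(value, __u8);			// 1 = passes the filter
 *   } task_filter;				// likewise cpu_filter (__u32 cpu)
 *						// and cgroup_filter (__u64 id)
 *
 *   struct {
 *	__uint(type, BPF_MAP_TYPE_HASH);
 *	__type(key, struct off_cpu_key);
 *	__type(value, __u64);			// accumulated off-cpu time (ns)
 *   } off_cpu;
 *
 *   struct {
 *	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
 *	__uint(value_size, MAX_STACKS * sizeof(__u64));
 *   } stacks;					// looked up by key.stack_id
 */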
int off_cpu_write(struct perf_session *session)
{
	int bytes = 0, size;
	int fd, stack;
	u32 raw_size;
	u64 sample_type, val, sid = 0;
	struct evsel *evsel;
	struct perf_data_file *file = &session->data->file;
	struct off_cpu_key prev, key;
	union off_cpu_data data = {
		.hdr = {
			.type = PERF_RECORD_SAMPLE,
			.misc = PERF_RECORD_MISC_USER,
		},
	};
	u64 tstamp = OFF_CPU_TIMESTAMP;

	skel->bss->enabled = 0;

	evsel = evlist__find_evsel_by_str(session->evlist, OFFCPU_EVENT);
	if (evsel == NULL) {
		pr_err("%s evsel not found\n", OFFCPU_EVENT);
		return 0;
	}

	sample_type = evsel->core.attr.sample_type;

	if (sample_type & ~OFFCPU_SAMPLE_TYPES) {
		pr_err("unsupported sample type: %llx\n",
		       (unsigned long long)sample_type);
		return -1;
	}

	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) {
		if (evsel->core.id)
			sid = evsel->core.id[0];
	}

	fd = bpf_map__fd(skel->maps.off_cpu);
	stack = bpf_map__fd(skel->maps.stacks);
	memset(&prev, 0, sizeof(prev));

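	/*
	 * Each off_cpu map entry below becomes one synthesized
	 * PERF_RECORD_SAMPLE.  The PERF_SAMPLE_RAW payload staged in
	 * off_cpu_raw has this layout for a stack of 'len' entries:
	 *
	 *   [0]           (u64)pid << 32 | tgid
	 *   [1]           accumulated off-cpu time in ns
	 *   [2]           callchain nr = len + 1 (context marker included)
	 *   [3]           PERF_CONTEXT_USER
	 *   [4 .. len+3]  stack ips from the stacks map
	 *   [len+4]       cgroup id
	 *
	 * which is why off_cpu_raw has MAX_STACKS + 5 slots.
	 */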
	while (!bpf_map_get_next_key(fd, &prev, &key)) {
		int n = 1; /* start from perf_event_header */

		bpf_map_lookup_elem(fd, &key, &val);

		/* zero-fill some of the fields, will be overwritten by raw_data when parsing */
		if (sample_type & PERF_SAMPLE_IDENTIFIER)
			data.array[n++] = sid;
		if (sample_type & PERF_SAMPLE_IP)
			data.array[n++] = 0; /* will be updated */
		if (sample_type & PERF_SAMPLE_TID)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_TIME)
			data.array[n++] = tstamp;
		if (sample_type & PERF_SAMPLE_CPU)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_PERIOD)
			data.array[n++] = 0;
		if (sample_type & PERF_SAMPLE_RAW) {
			/*
			 *  [ size ][ data ]
			 *  [ data ]
			 *  [ data ]
			 *  [ data ]
			 *  [ data ][ empty]
			 */
			int len = 0, i = 0;
			void *raw_data = (void *)data.array + n * sizeof(u64);

			off_cpu_raw[i++] = (u64)key.pid << 32 | key.tgid;
			off_cpu_raw[i++] = val;

			/* off_cpu_raw[i] is callchain->nr (updated later) */
			off_cpu_raw[i + 1] = PERF_CONTEXT_USER;
			off_cpu_raw[i + 2] = 0;

			bpf_map_lookup_elem(stack, &key.stack_id, &off_cpu_raw[i + 2]);
			while (off_cpu_raw[i + 2 + len])
				len++;

			off_cpu_raw[i] = len + 1;
			i += len + 2;

			off_cpu_raw[i++] = key.cgroup_id;

			raw_size = i * sizeof(u64) + sizeof(u32); /* 4 bytes for alignment */
			memcpy(raw_data, &raw_size, sizeof(raw_size));
			memcpy(raw_data + sizeof(u32), off_cpu_raw, i * sizeof(u64));

			n += i + 1;
		}
		if (sample_type & PERF_SAMPLE_CGROUP)
			data.array[n++] = key.cgroup_id;

		size = n * sizeof(u64);
		data.hdr.size = size;
		bytes += size;

		if (perf_data_file__write(file, &data, size) < 0) {
			pr_err("failed to write perf data, error: %m\n");
			return bytes;
		}

		prev = key;
		/* increase dummy timestamp to sort later samples */
		tstamp++;
	}
	return bytes;
}
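/*
 * Example usage (a sketch; see the perf-record documentation for the
 * authoritative option names):
 *
 *   # perf record --off-cpu -- ./workload
 *   # perf report
 *
 * The off-cpu samples are appended when off_cpu_write() runs at record
 * exit, carrying dummy timestamps starting at OFF_CPU_TIMESTAMP so that
 * they sort after all regular samples in perf.data.
 */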