// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2021 Facebook */
/* Copyright (c) 2021 Google */

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <linux/err.h>
#include <linux/zalloc.h>
#include <linux/perf_event.h>
#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <perf/bpf_perf.h>

#include "affinity.h"
#include "bpf_counter.h"
#include "cgroup.h"
#include "counts.h"
#include "debug.h"
#include "evsel.h"
#include "evlist.h"
#include "target.h"
#include "cpumap.h"
#include "thread_map.h"

#include "bpf_skel/bperf_cgroup.h"
#include "bpf_skel/bperf_cgroup.skel.h"

static struct perf_event_attr cgrp_switch_attr = {
	.type = PERF_TYPE_SOFTWARE,
	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
	.size = sizeof(cgrp_switch_attr),
	.sample_period = 1,
	.disabled = 1,
};

static struct evsel *cgrp_switch;
static struct bperf_cgroup_bpf *skel;

/* perf event fd of the evsel at the given CPU map index (thread 0) */
#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))

static void setup_rodata(struct bperf_cgroup_bpf *sk, int evlist_size)
{
	int map_size, total_cpus = cpu__max_cpu().cpu;

	sk->rodata->num_cpus = total_cpus;
	sk->rodata->num_events = evlist_size / nr_cgroups;

	if (cgroup_is_v2("perf_event") > 0)
		sk->rodata->use_cgroup_v2 = 1;

	BUG_ON(evlist_size % nr_cgroups != 0);

	/* we need one copy of events per cpu for reading */
	map_size = total_cpus * evlist_size / nr_cgroups;
	bpf_map__set_max_entries(sk->maps.events, map_size);
	bpf_map__set_max_entries(sk->maps.cgrp_idx, nr_cgroups);
	/* previous result is saved in a per-cpu array */
	map_size = evlist_size / nr_cgroups;
	bpf_map__set_max_entries(sk->maps.prev_readings, map_size);
	/* cgroup result needs all events (per-cpu) */
	map_size = evlist_size;
	bpf_map__set_max_entries(sk->maps.cgrp_readings, map_size);
}

static void test_max_events_program_load(void)
{
#ifndef NDEBUG
	/*
	 * Test that the program verifies with the maximum number of events.
	 * If this test fails, perf unfortunately needs recompiling with a
	 * lower BPERF_CGROUP__MAX_EVENTS to avoid BPF verifier issues.
	 */
	int err, max_events = BPERF_CGROUP__MAX_EVENTS * nr_cgroups;
	struct bperf_cgroup_bpf *test_skel = bperf_cgroup_bpf__open();

	if (!test_skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return;
	}
	setup_rodata(test_skel, max_events);
	err = bperf_cgroup_bpf__load(test_skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton with max events %d.\n",
		       BPERF_CGROUP__MAX_EVENTS);
	}
	bperf_cgroup_bpf__destroy(test_skel);
#endif
}

static int bperf_load_program(struct evlist *evlist)
{
	struct bpf_link *link;
	struct evsel *evsel;
	struct cgroup *cgrp, *leader_cgrp;
	int i, j;
	struct perf_cpu cpu;
	int total_cpus = cpu__max_cpu().cpu;
	int map_fd, prog_fd, err;

	set_max_rlimit();

	test_max_events_program_load();

	skel = bperf_cgroup_bpf__open();
	if (!skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return -1;
	}
	setup_rodata(skel, evlist->core.nr_entries);

	err = bperf_cgroup_bpf__load(skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton\n");
		goto out;
	}

	err = -1;

	cgrp_switch = evsel__new(&cgrp_switch_attr);
	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
		pr_err("Failed to open cgroup switches event\n");
		goto out;
	}

	perf_cpu_map__for_each_cpu(cpu, i, evlist->core.all_cpus) {
		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
						      FD(cgrp_switch, i));
		if (IS_ERR(link)) {
			pr_err("Failed to attach cgroup program\n");
			err = PTR_ERR(link);
			goto out;
		}
	}

	/*
	 * Update cgrp_idx map from cgroup-id to event index.
	 */
	cgrp = NULL;
	i = 0;

	evlist__for_each_entry(evlist, evsel) {
		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
			leader_cgrp = evsel->cgrp;
			evsel->cgrp = NULL;

			/* open single copy of the events w/o cgroup */
			err = evsel__open_per_cpu(evsel, evsel->core.cpus, -1);
			if (err == 0)
				evsel->supported = true;

			map_fd = bpf_map__fd(skel->maps.events);
			perf_cpu_map__for_each_cpu(cpu, j, evsel->core.cpus) {
				int fd = FD(evsel, j);
				__u32 idx = evsel->core.idx * total_cpus + cpu.cpu;

				bpf_map_update_elem(map_fd, &idx, &fd, BPF_ANY);
			}

			evsel->cgrp = leader_cgrp;
		}

		if (evsel->cgrp == cgrp)
			continue;

		cgrp = evsel->cgrp;

		if (read_cgroup_id(cgrp) < 0) {
			pr_debug("Failed to get cgroup id for %s\n", cgrp->name);
			cgrp->id = 0;
		}

		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
		if (err < 0) {
			pr_err("Failed to update cgroup index map\n");
			goto out;
		}

		i++;
	}

	/*
	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
	 * whether the kernel supports it.
	 */
	prog_fd = bpf_program__fd(skel->progs.trigger_read);
	err = bperf_trigger_reading(prog_fd, 0);
	if (err) {
		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
		err = 0;
	}

out:
	return err;
}

static int bperf_cgrp__load(struct evsel *evsel,
			    struct target *target __maybe_unused)
{
	static bool bperf_loaded = false;

	evsel->bperf_leader_prog_fd = -1;
	evsel->bperf_leader_link_fd = -1;

	if (!bperf_loaded && bperf_load_program(evsel->evlist))
		return -1;

	bperf_loaded = true;
	/* just to bypass bpf_counter_skip() */
	evsel->follower_skel = (struct bperf_follower_bpf *)skel;

	return 0;
}

static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
				  int cpu_map_idx __maybe_unused,
				  int fd __maybe_unused)
{
	/* nothing to do */
	return 0;
}

/*
 * trigger the leader prog on each cpu, so the cgrp_reading map could get
 * the latest results.
 */
static int bperf_cgrp__sync_counters(struct evlist *evlist)
{
	struct perf_cpu cpu;
	int idx;
	int prog_fd = bpf_program__fd(skel->progs.trigger_read);

	perf_cpu_map__for_each_cpu(cpu, idx, evlist->core.all_cpus)
		bperf_trigger_reading(prog_fd, cpu.cpu);

	return 0;
}

static int bperf_cgrp__enable(struct evsel *evsel)
{
	/* the skeleton is shared by all evsels; only the first one acts */
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 1;
	return 0;
}

static int bperf_cgrp__disable(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 0;
	return 0;
}

static int bperf_cgrp__read(struct evsel *evsel)
{
	struct evlist *evlist = evsel->evlist;
	int total_cpus = cpu__max_cpu().cpu;
	struct perf_counts_values *counts;
	struct bpf_perf_event_value *values;
	int reading_map_fd, err = 0;

	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	values = calloc(total_cpus, sizeof(*values));
	if (values == NULL)
		return -ENOMEM;

	reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings);

	evlist__for_each_entry(evlist, evsel) {
		__u32 idx = evsel->core.idx;
		int i;
		struct perf_cpu cpu;

		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
		if (err) {
			pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n",
			       idx, evsel__name(evsel), evsel->cgrp->name);
			goto out;
		}

		perf_cpu_map__for_each_cpu(cpu, i, evsel->core.cpus) {
			counts = perf_counts(evsel->counts, i, 0);
			counts->val = values[cpu.cpu].counter;
			counts->ena = values[cpu.cpu].enabled;
			counts->run = values[cpu.cpu].running;
		}
	}

out:
	free(values);
	return err;
}

static int bperf_cgrp__destroy(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgroup_bpf__destroy(skel);
	evsel__delete(cgrp_switch); // it'll destroy on_switch progs too

	return 0;
}

struct bpf_counter_ops bperf_cgrp_ops = {
	.load = bperf_cgrp__load,
	.enable = bperf_cgrp__enable,
	.disable = bperf_cgrp__disable,
	.read = bperf_cgrp__read,
	.install_pe = bperf_cgrp__install_pe,
	.destroy = bperf_cgrp__destroy,
};