// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2021 Facebook */
/* Copyright (c) 2021 Google */

#include <assert.h>
#include <limits.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <linux/err.h>
#include <linux/zalloc.h>
#include <linux/perf_event.h>
#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <perf/bpf_perf.h>

#include "affinity.h"
#include "bpf_counter.h"
#include "cgroup.h"
#include "counts.h"
#include "debug.h"
#include "evsel.h"
#include "evlist.h"
#include "target.h"
#include "cpumap.h"
#include "thread_map.h"

#include "bpf_skel/bperf_cgroup.skel.h"

static struct perf_event_attr cgrp_switch_attr = {
	.type = PERF_TYPE_SOFTWARE,
	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
	.size = sizeof(cgrp_switch_attr),
	.sample_period = 1,
	.disabled = 1,
};

static struct evsel *cgrp_switch;
static struct bperf_cgroup_bpf *skel;

/* perf event fd of @evt at cpu map index @cpu */
#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))

/*
 * Open and load a single cgroup skeleton shared by all events, attach it
 * to the cgroup-switch event on each cpu and fill in its BPF maps.
 */
static int bperf_load_program(struct evlist *evlist)
{
	struct bpf_link *link;
	struct evsel *evsel;
	struct cgroup *cgrp, *leader_cgrp;
	int i, j;
	struct perf_cpu cpu;
	int total_cpus = cpu__max_cpu().cpu;
	int map_size, map_fd;
	int prog_fd, err;

	skel = bperf_cgroup_bpf__open();
	if (!skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return -1;
	}

	skel->rodata->num_cpus = total_cpus;
	skel->rodata->num_events = evlist->core.nr_entries / nr_cgroups;

	if (cgroup_is_v2("perf_event") > 0)
		skel->rodata->use_cgroup_v2 = 1;

	BUG_ON(evlist->core.nr_entries % nr_cgroups != 0);

	/* we need one copy of events per cpu for reading */
	map_size = total_cpus * evlist->core.nr_entries / nr_cgroups;
	bpf_map__set_max_entries(skel->maps.events, map_size);
	bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
	/* previous result is saved in a per-cpu array */
	map_size = evlist->core.nr_entries / nr_cgroups;
	bpf_map__set_max_entries(skel->maps.prev_readings, map_size);
	/* cgroup result needs all events (per-cpu) */
	map_size = evlist->core.nr_entries;
	bpf_map__set_max_entries(skel->maps.cgrp_readings, map_size);

	set_max_rlimit();

	err = bperf_cgroup_bpf__load(skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton\n");
		goto out;
	}

	err = -1;

	cgrp_switch = evsel__new(&cgrp_switch_attr);
	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
		pr_err("Failed to open cgroup switches event\n");
		goto out;
	}

	perf_cpu_map__for_each_cpu(cpu, i, evlist->core.all_cpus) {
		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
						      FD(cgrp_switch, i));
		if (IS_ERR(link)) {
			pr_err("Failed to attach cgroup program\n");
			err = PTR_ERR(link);
			goto out;
		}
	}

	/*
	 * Update cgrp_idx map from cgroup-id to event index.
	 */
	cgrp = NULL;
	i = 0;

	evlist__for_each_entry(evlist, evsel) {
		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
			leader_cgrp = evsel->cgrp;
			evsel->cgrp = NULL;

			/* open single copy of the events w/o cgroup */
			err = evsel__open_per_cpu(evsel, evsel->core.cpus, -1);
			if (err == 0)
				evsel->supported = true;

			map_fd = bpf_map__fd(skel->maps.events);
			perf_cpu_map__for_each_cpu(cpu, j, evsel->core.cpus) {
				int fd = FD(evsel, j);
				__u32 idx = evsel->core.idx * total_cpus + cpu.cpu;

				bpf_map_update_elem(map_fd, &idx, &fd, BPF_ANY);
			}

			evsel->cgrp = leader_cgrp;
		}

		if (evsel->cgrp == cgrp)
			continue;

		cgrp = evsel->cgrp;

		if (read_cgroup_id(cgrp) < 0) {
			pr_debug("Failed to get cgroup id for %s\n", cgrp->name);
			cgrp->id = 0;
		}

		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
		if (err < 0) {
			pr_err("Failed to update cgroup index map\n");
			goto out;
		}

		i++;
	}

	/*
	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
	 * whether the kernel supports it.
	 */
	prog_fd = bpf_program__fd(skel->progs.trigger_read);
	err = bperf_trigger_reading(prog_fd, 0);
	if (err) {
		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
		err = 0;
	}

out:
	return err;
}

/* all cgroup events share a single skeleton, so load it only once */
static int bperf_cgrp__load(struct evsel *evsel,
			    struct target *target __maybe_unused)
{
	static bool bperf_loaded = false;

	evsel->bperf_leader_prog_fd = -1;
	evsel->bperf_leader_link_fd = -1;

	if (!bperf_loaded && bperf_load_program(evsel->evlist))
		return -1;

	bperf_loaded = true;
	/* just to bypass bpf_counter_skip() */
	evsel->follower_skel = (struct bperf_follower_bpf *)skel;

	return 0;
}

static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
				  int cpu_map_idx __maybe_unused,
				  int fd __maybe_unused)
{
	/* nothing to do */
	return 0;
}

/*
 * trigger the leader prog on each cpu, so the cgrp_readings map could get
 * the latest results.
 */
static int bperf_cgrp__sync_counters(struct evlist *evlist)
{
	struct perf_cpu cpu;
	int idx;
	int prog_fd = bpf_program__fd(skel->progs.trigger_read);

	perf_cpu_map__for_each_cpu(cpu, idx, evlist->core.all_cpus)
		bperf_trigger_reading(prog_fd, cpu.cpu);

	return 0;
}

static int bperf_cgrp__enable(struct evsel *evsel)
{
	/* the enabled flag is shared; only the first evsel flips it */
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 1;
	return 0;
}

static int bperf_cgrp__disable(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 0;
	return 0;
}

static int bperf_cgrp__read(struct evsel *evsel)
{
	struct evlist *evlist = evsel->evlist;
	int total_cpus = cpu__max_cpu().cpu;
	struct perf_counts_values *counts;
	struct bpf_perf_event_value *values;
	int reading_map_fd, err = 0;

	/* the first evsel reads and fills counts for all events in the list */
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	values = calloc(total_cpus, sizeof(*values));
	if (values == NULL)
		return -ENOMEM;

	reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings);

	evlist__for_each_entry(evlist, evsel) {
		__u32 idx = evsel->core.idx;
		int i;
		struct perf_cpu cpu;

		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
		if (err) {
			pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n",
			       idx, evsel__name(evsel), evsel->cgrp->name);
			goto out;
		}

		perf_cpu_map__for_each_cpu(cpu, i, evsel->core.cpus) {
			counts = perf_counts(evsel->counts, i, 0);
			counts->val = values[cpu.cpu].counter;
			counts->ena = values[cpu.cpu].enabled;
			counts->run = values[cpu.cpu].running;
		}
	}

out:
	free(values);
	return err;
}

static int bperf_cgrp__destroy(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgroup_bpf__destroy(skel);
	evsel__delete(cgrp_switch); // it'll destroy on_switch progs too

	return 0;
}

struct bpf_counter_ops bperf_cgrp_ops = {
	.load = bperf_cgrp__load,
	.enable = bperf_cgrp__enable,
	.disable = bperf_cgrp__disable,
	.read = bperf_cgrp__read,
	.install_pe = bperf_cgrp__install_pe,
	.destroy = bperf_cgrp__destroy,
};