// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2021 Facebook */
/* Copyright (c) 2021 Google */

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <linux/err.h>
#include <linux/zalloc.h>
#include <linux/perf_event.h>
#include <api/fs/fs.h>
#include <bpf/bpf.h>
#include <perf/bpf_perf.h>

#include "affinity.h"
#include "bpf_counter.h"
#include "cgroup.h"
#include "counts.h"
#include "debug.h"
#include "evsel.h"
#include "evlist.h"
#include "target.h"
#include "cpumap.h"
#include "thread_map.h"

#include "bpf_skel/bperf_cgroup.h"
#include "bpf_skel/bperf_cgroup.skel.h"

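/*
 * Software event that fires on every cgroup switch.  The BPF program below
 * is attached to this event on each CPU so it runs whenever the running
 * cgroup changes.
 */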
static struct perf_event_attr cgrp_switch_attr = {
	.type = PERF_TYPE_SOFTWARE,
	.config = PERF_COUNT_SW_CGROUP_SWITCHES,
	.size = sizeof(cgrp_switch_attr),
	.sample_period = 1,
	.disabled = 1,
};

static struct evsel *cgrp_switch;
static struct bperf_cgroup_bpf *skel;

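/* perf event file descriptor of @evt at cpu-map index @cpu */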
#define FD(evt, cpu) (*(int *)xyarray__entry(evt->core.fd, cpu, 0))

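/*
 * Fill in the skeleton's read-only constants (number of CPUs, events per
 * cgroup, cgroup v1/v2 mode) and size its maps before the program is
 * loaded.  evlist_size counts all event instances across all cgroups, so
 * it must be a multiple of nr_cgroups.
 */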
static void setup_rodata(struct bperf_cgroup_bpf *sk, int evlist_size)
{
	int map_size, total_cpus = cpu__max_cpu().cpu;

	sk->rodata->num_cpus = total_cpus;
	sk->rodata->num_events = evlist_size / nr_cgroups;

	if (cgroup_is_v2("perf_event") > 0)
		sk->rodata->use_cgroup_v2 = 1;

	BUG_ON(evlist_size % nr_cgroups != 0);

	/* we need one copy of events per cpu for reading */
	map_size = total_cpus * evlist_size / nr_cgroups;
	bpf_map__set_max_entries(sk->maps.events, map_size);
	bpf_map__set_max_entries(sk->maps.cgrp_idx, nr_cgroups);
	/* previous result is saved in a per-cpu array */
	map_size = evlist_size / nr_cgroups;
	bpf_map__set_max_entries(sk->maps.prev_readings, map_size);
	/* cgroup result needs all events (per-cpu) */
	map_size = evlist_size;
	bpf_map__set_max_entries(sk->maps.cgrp_readings, map_size);
}

static void test_max_events_program_load(void)
{
#ifndef NDEBUG
	/*
	 * Test that the program verifies with the maximum number of events.
	 * If this test fails, unfortunately perf needs recompiling with a
	 * lower BPERF_CGROUP__MAX_EVENTS to avoid BPF verifier issues.
	 */
	int err, max_events = BPERF_CGROUP__MAX_EVENTS * nr_cgroups;
	struct bperf_cgroup_bpf *test_skel = bperf_cgroup_bpf__open();

	if (!test_skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return;
	}
	setup_rodata(test_skel, max_events);
	err = bperf_cgroup_bpf__load(test_skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton with max events %d.\n",
		       BPERF_CGROUP__MAX_EVENTS);
	}
	bperf_cgroup_bpf__destroy(test_skel);
#endif
}

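/*
 * Load and wire up the shared BPF program: open the cgroup-switch event on
 * every CPU and attach the program to it, open one copy of each event
 * without a cgroup and store its fd in the 'events' map, and fill the
 * 'cgrp_idx' map with the index assigned to each cgroup id.
 */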
static int bperf_load_program(struct evlist *evlist)
{
	struct bpf_link *link;
	struct evsel *evsel;
	struct cgroup *cgrp, *leader_cgrp;
	int i, j;
	struct perf_cpu cpu;
	int total_cpus = cpu__max_cpu().cpu;
	int map_fd, prog_fd, err;

	set_max_rlimit();

	test_max_events_program_load();

	skel = bperf_cgroup_bpf__open();
	if (!skel) {
		pr_err("Failed to open cgroup skeleton\n");
		return -1;
	}
	setup_rodata(skel, evlist->core.nr_entries);

	err = bperf_cgroup_bpf__load(skel);
	if (err) {
		pr_err("Failed to load cgroup skeleton\n");
		goto out;
	}

	err = -1;

	cgrp_switch = evsel__new(&cgrp_switch_attr);
	if (evsel__open_per_cpu(cgrp_switch, evlist->core.all_cpus, -1) < 0) {
		pr_err("Failed to open cgroup switches event\n");
		goto out;
	}

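	/* attach the BPF program to the cgroup-switch event on each CPU */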
	perf_cpu_map__for_each_cpu(cpu, i, evlist->core.all_cpus) {
		link = bpf_program__attach_perf_event(skel->progs.on_cgrp_switch,
						      FD(cgrp_switch, i));
		if (IS_ERR(link)) {
			pr_err("Failed to attach cgroup program\n");
			err = PTR_ERR(link);
			goto out;
		}
	}

	/*
	 * Update cgrp_idx map from cgroup-id to event index.
	 */
	cgrp = NULL;
	i = 0;

	evlist__for_each_entry(evlist, evsel) {
		if (cgrp == NULL || evsel->cgrp == leader_cgrp) {
			leader_cgrp = evsel->cgrp;
			evsel->cgrp = NULL;

			/* open single copy of the events w/o cgroup */
			err = evsel__open_per_cpu(evsel, evsel->core.cpus, -1);
			if (err == 0)
				evsel->supported = true;

			map_fd = bpf_map__fd(skel->maps.events);
			perf_cpu_map__for_each_cpu(cpu, j, evsel->core.cpus) {
				int fd = FD(evsel, j);
				__u32 idx = evsel->core.idx * total_cpus + cpu.cpu;

				bpf_map_update_elem(map_fd, &idx, &fd, BPF_ANY);
			}

			evsel->cgrp = leader_cgrp;
		}

		if (evsel->cgrp == cgrp)
			continue;

		cgrp = evsel->cgrp;

		if (read_cgroup_id(cgrp) < 0) {
			pr_debug("Failed to get cgroup id for %s\n", cgrp->name);
			cgrp->id = 0;
		}

		map_fd = bpf_map__fd(skel->maps.cgrp_idx);
		err = bpf_map_update_elem(map_fd, &cgrp->id, &i, BPF_ANY);
		if (err < 0) {
			pr_err("Failed to update cgroup index map\n");
			goto out;
		}

		i++;
	}

	/*
	 * bperf uses BPF_PROG_TEST_RUN to get accurate reading. Check
	 * whether the kernel supports it.
	 */
	prog_fd = bpf_program__fd(skel->progs.trigger_read);
	err = bperf_trigger_reading(prog_fd, 0);
	if (err) {
		pr_warning("The kernel does not support test_run for raw_tp BPF programs.\n"
			   "Therefore, --for-each-cgroup might show inaccurate readings\n");
		err = 0;
	}

out:
	return err;
}

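/*
 * Called for every evsel; the shared BPF program is loaded only once and
 * follower_skel is pointed at it so bpf_counter_skip() does not skip the
 * evsel.
 */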
static int bperf_cgrp__load(struct evsel *evsel,
			    struct target *target __maybe_unused)
{
	static bool bperf_loaded = false;

	evsel->bperf_leader_prog_fd = -1;
	evsel->bperf_leader_link_fd = -1;

	if (!bperf_loaded && bperf_load_program(evsel->evlist))
		return -1;

	bperf_loaded = true;
	/* just to bypass bpf_counter_skip() */
	evsel->follower_skel = (struct bperf_follower_bpf *)skel;

	return 0;
}

static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused,
				  int cpu_map_idx __maybe_unused,
				  int fd __maybe_unused)
{
	/* nothing to do */
	return 0;
}

/*
 * Trigger the leader prog on each cpu, so the cgrp_readings map can get
 * the latest results.
 */
static int bperf_cgrp__sync_counters(struct evlist *evlist)
{
	struct perf_cpu cpu;
	int idx;
	int prog_fd = bpf_program__fd(skel->progs.trigger_read);

	perf_cpu_map__for_each_cpu(cpu, idx, evlist->core.all_cpus)
		bperf_trigger_reading(prog_fd, cpu.cpu);

	return 0;
}

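/*
 * Only the first evsel toggles counting: trigger a read on every CPU so
 * the BPF side refreshes its per-cpu readings, then set the global
 * 'enabled' flag in the program's BSS.
 */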
static int bperf_cgrp__enable(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 1;
	return 0;
}

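/* counterpart of enable: trigger a final read on every CPU, then stop counting */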
static int bperf_cgrp__disable(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	skel->bss->enabled = 0;
	return 0;
}

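/*
 * Read the per-cgroup results (done once, by the first evsel): sync the
 * counters, then for each event look up its slot in the 'cgrp_readings'
 * map, which holds one bpf_perf_event_value per CPU, and copy the values
 * into the evsel's counts.
 */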
static int bperf_cgrp__read(struct evsel *evsel)
{
	struct evlist *evlist = evsel->evlist;
	int total_cpus = cpu__max_cpu().cpu;
	struct perf_counts_values *counts;
	struct bpf_perf_event_value *values;
	int reading_map_fd, err = 0;

	if (evsel->core.idx)
		return 0;

	bperf_cgrp__sync_counters(evsel->evlist);

	values = calloc(total_cpus, sizeof(*values));
	if (values == NULL)
		return -ENOMEM;

	reading_map_fd = bpf_map__fd(skel->maps.cgrp_readings);

	evlist__for_each_entry(evlist, evsel) {
		__u32 idx = evsel->core.idx;
		int i;
		struct perf_cpu cpu;

		err = bpf_map_lookup_elem(reading_map_fd, &idx, values);
		if (err) {
			pr_err("bpf map lookup failed: idx=%u, event=%s, cgrp=%s\n",
			       idx, evsel__name(evsel), evsel->cgrp->name);
			goto out;
		}

		perf_cpu_map__for_each_cpu(cpu, i, evsel->core.cpus) {
			counts = perf_counts(evsel->counts, i, 0);
			counts->val = values[cpu.cpu].counter;
			counts->ena = values[cpu.cpu].enabled;
			counts->run = values[cpu.cpu].running;
		}
	}

out:
	free(values);
	return err;
}

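/* tear down the skeleton and the cgroup-switch event (first evsel only) */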
static int bperf_cgrp__destroy(struct evsel *evsel)
{
	if (evsel->core.idx)
		return 0;

	bperf_cgroup_bpf__destroy(skel);
	evsel__delete(cgrp_switch); // it'll destroy on_switch progs too

	return 0;
}

struct bpf_counter_ops bperf_cgrp_ops = {
	.load = bperf_cgrp__load,
	.enable = bperf_cgrp__enable,
	.disable = bperf_cgrp__disable,
	.read = bperf_cgrp__read,
	.install_pe = bperf_cgrp__install_pe,
	.destroy = bperf_cgrp__destroy,
};