1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
2 // Copyright (c) 2021 Facebook
3 // Copyright (c) 2021 Google
4 #include "bperf_cgroup.h"
5 #include "vmlinux.h"
6 #include <bpf/bpf_helpers.h>
7 #include <bpf/bpf_tracing.h>
8 #include <bpf/bpf_core_read.h>
9
// NOTE: many of the maps and global variables will be modified before loading
// from the userspace (perf tool) using the skeleton helpers.
12
13 // single set of global perf events to measure
// single set of global perf events to measure
// Perf-event array: one slot per (event, cpu) pair; fd values are filled in
// by the perf tool before load, and max_entries is resized via the skeleton.
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);	// placeholder; resized to num_events * num_cpus
} events SEC(".maps");
20
21 // from cgroup id to event index
// from cgroup id to event index
// Hash map populated by userspace: key is the 64-bit cgroup id, value is the
// dense index used to address cgrp_readings slots.
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);	// placeholder; resized to the number of cgroups
} cgrp_idx SEC(".maps");
28
29 // per-cpu event snapshots to calculate delta
30 struct {
31 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
32 __uint(key_size, sizeof(__u32));
33 __uint(value_size, sizeof(struct bpf_perf_event_value));
34 } prev_readings SEC(".maps");
35
36 // aggregated event values for each cgroup (per-cpu)
37 // will be read from the user-space
38 struct {
39 __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
40 __uint(key_size, sizeof(__u32));
41 __uint(value_size, sizeof(struct bpf_perf_event_value));
42 } cgrp_readings SEC(".maps");
43
44 /* new kernel cgroup definition */
/*
 * new kernel cgroup definition
 *
 * CO-RE "flavor" struct (the ___new suffix is stripped at relocation time):
 * newer kernels expose the ancestor chain as an array of struct cgroup
 * pointers; the cgroup id is then read via kn->id.
 */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));
49
50 /* old kernel cgroup definition */
/*
 * old kernel cgroup definition
 *
 * CO-RE "flavor" struct for kernels that stored ancestor cgroup ids
 * directly as a u64 array instead of cgroup pointers.
 */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));
55
/* set by the perf tool before load (rodata; constant-folded by the verifier) */
const volatile __u32 num_events = 1;	// number of events being counted
const volatile __u32 num_cpus = 1;	// number of possible cpus

const volatile int use_cgroup_v2 = 0;	// select v2 vs v1 ancestor walk

int enabled = 0;	// toggled by userspace to start/stop aggregation
int perf_subsys_id = -1;	// cached perf_event cgroup subsys id (v1 only)
62
get_cgroup_v1_ancestor_id(struct cgroup * cgrp,int level)63 static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
64 {
65 /* recast pointer to capture new type for compiler */
66 struct cgroup___new *cgrp_new = (void *)cgrp;
67
68 if (bpf_core_field_exists(cgrp_new->ancestors)) {
69 return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
70 } else {
71 /* recast pointer to capture old type for compiler */
72 struct cgroup___old *cgrp_old = (void *)cgrp;
73
74 return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
75 }
76 }
77
/*
 * Fill @cgrps with the map indexes of every monitored cgroup-v1 ancestor of
 * the current task (including the task's own cgroup), up to @size entries.
 * Returns the number of indexes written.
 *
 * Ancestors not present in the cgrp_idx hash are simply skipped, so only
 * cgroups the user asked to count contribute entries.
 */
static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;	// kept in a register to satisfy the BPF verifier
	__u32 *elem;
	int level;
	int cnt;

	/* resolve and cache the perf_event subsys id on first use */
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		/* CO-RE: read the enum value from the running kernel's BTF */
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	/* the task's cgroup in the perf_event v1 hierarchy, and its depth */
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	/* walk from the root (i == 0) down to the task's own level */
	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;	/* this ancestor is not being counted */

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
117
/*
 * Fill @cgrps with the map indexes of every monitored cgroup-v2 ancestor of
 * the current task, up to @size entries.  Returns the number written.
 *
 * Uses the bpf_get_current_ancestor_cgroup_id() helper, which returns 0 once
 * @i exceeds the depth of the task's cgroup.
 */
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;	// kept in a register to satisfy the BPF verifier
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		/* 0 means we walked past the task's own level */
		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;	/* this ancestor is not being counted */

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
142
/*
 * Core counting routine, shared by both attach points.
 *
 * For every event: read the current per-cpu counter value, compute the delta
 * against the snapshot saved in prev_readings, and (while counting is
 * enabled) add that delta to the cgrp_readings slot of each ancestor cgroup
 * of the current task.  Finally store the fresh reading as the new snapshot.
 *
 * Layout of map keys (both computed below):
 *   events:        idx * num_cpus + cpu      (per-event, per-cpu fd)
 *   cgrp_readings: cgrp * num_events + idx   (per-cgroup, per-event slot)
 */
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0; // to have it in a register to pass BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	/* collect the map indexes of all monitored ancestor cgroups */
	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);

	for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
		/* loop bound is a constant for the verifier; real bound here */
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is per-cpu array for diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			/* first time on this cpu: seed a zeroed snapshot */
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;	/* event not (yet) set up on this cpu */

		if (enabled) {
			/* growth since the last snapshot on this cpu */
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		/* always refresh the snapshot, even while disabled */
		*prev_val = val;
	}
	return 0;
}
211
212 // This will be attached to cgroup-switches event for each cpu
// This will be attached to cgroup-switches event for each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	/* snapshot/aggregate counters at the cgroup boundary */
	return bperf_cgroup_count();
}
218
/*
 * Invoked on demand by the perf tool (via a test-run of this raw tracepoint
 * program) to force a final read/aggregation before results are collected.
 */
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}
224
225 char LICENSE[] SEC("license") = "Dual BSD/GPL";
226