xref: /linux/tools/perf/util/bpf_skel/bperf_cgroup.bpf.c (revision 9e906a9dead17d81d6c2687f65e159231d0e3286)
// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "bperf_cgroup.h"
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

// NOTE: many of the maps and global data will be modified before loading
//       by the userspace perf tool using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to event index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read by user space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");
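
/*
 * A rough sketch of how the perf tool is expected to size these maps
 * before loading the skeleton, per the NOTE above (see
 * tools/perf/util/bpf_counter_cgroup.c for the real code; ncpus, nevents
 * and ncgroups are illustrative counts, not names used there):
 *
 *	struct bperf_cgroup_bpf *skel = bperf_cgroup_bpf__open();
 *
 *	bpf_map__set_max_entries(skel->maps.events, ncpus * nevents);
 *	bpf_map__set_max_entries(skel->maps.cgrp_idx, ncgroups);
 *	bpf_map__set_max_entries(skel->maps.prev_readings, nevents);
 *	bpf_map__set_max_entries(skel->maps.cgrp_readings, ncgroups * nevents);
 *
 *	bperf_cgroup_bpf__load(skel);
 */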

/* new kernel cgroup definition */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

/* old kernel cgroup definition */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));
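
/*
 * Only one of the two layouts above matches the running kernel: newer
 * kernels keep struct cgroup pointers in ->ancestors[], older ones keep
 * plain ids in ->ancestor_ids[].  The bpf_core_field_exists() check below
 * lets CO-RE pick the right accessor at load time, so a single object
 * works on both.
 */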

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
const volatile int use_cgroup_v2 = 0;

int enabled = 0;
int perf_subsys_id = -1;

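/*
 * The const volatile knobs above are patched by the perf tool through the
 * skeleton's read-only data before the program is loaded; "enabled" is
 * toggled at runtime to start and stop the accounting.
 */
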
static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
	/* recast pointer to capture new type for compiler */
	struct cgroup___new *cgrp_new = (void *)cgrp;

	if (bpf_core_field_exists(cgrp_new->ancestors)) {
		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
	} else {
		/* recast pointer to capture old type for compiler */
		struct cgroup___old *cgrp_old = (void *)cgrp;

		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
	}
}

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

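	/*
	 * Resolve the perf_event cgroup subsystem id against the running
	 * kernel's BTF when CO-RE enum relocation is available; otherwise
	 * fall back to the value seen at compile time in vmlinux.h.
	 */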
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

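	/*
	 * Walk the task's cgroup ancestry from the root down to its own
	 * level; only ancestors that the perf tool registered in cgrp_idx
	 * contribute an index to the output array.
	 */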
	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

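// For cgroup v2, the ancestor cgroup ids can be read directly with a BPF
// helper; no CO-RE access to struct cgroup is needed.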
static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

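// Snapshot each event's counter on this cpu, compute the delta since the
// previous snapshot, and charge that delta to every monitored ancestor
// cgroup of the current task.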
static int bperf_cgroup_count(void)
{
	register __u32 idx = 0;  // keep it in a register to satisfy the BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);

	for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is a per-cpu array used to compute the diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from the global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to the cgroup-switches event on each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

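// Run from user space (the perf tool is expected to invoke this program,
// e.g. via BPF_PROG_TEST_RUN on each cpu, to flush pending counter deltas
// before reading cgrp_readings).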
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";