// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2021 Facebook
// Copyright (c) 2021 Google
#include "bperf_cgroup.h"
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

// NOTE: many of the maps and global data will be modified before loading
//       from the userspace (perf tool) using the skeleton helpers.

// single set of global perf events to measure
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(int));
	__uint(max_entries, 1);
} events SEC(".maps");

// from cgroup id to cgroup index
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u32));
	__uint(max_entries, 1);
} cgrp_idx SEC(".maps");

// per-cpu event snapshots to calculate delta
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} prev_readings SEC(".maps");

// aggregated event values for each cgroup (per-cpu)
// will be read from the user-space
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct bpf_perf_event_value));
} cgrp_readings SEC(".maps");

/* new kernel cgroup definition */
struct cgroup___new {
	int level;
	struct cgroup *ancestors[];
} __attribute__((preserve_access_index));

/* old kernel cgroup definition */
struct cgroup___old {
	int level;
	u64 ancestor_ids[];
} __attribute__((preserve_access_index));

const volatile __u32 num_events = 1;
const volatile __u32 num_cpus = 1;
const volatile int use_cgroup_v2 = 0;

int enabled = 0;
int perf_subsys_id = -1;

static inline __u64 get_cgroup_v1_ancestor_id(struct cgroup *cgrp, int level)
{
	/* recast pointer to capture new type for compiler */
	struct cgroup___new *cgrp_new = (void *)cgrp;

	if (bpf_core_field_exists(cgrp_new->ancestors)) {
		return BPF_CORE_READ(cgrp_new, ancestors[level], kn, id);
	} else {
		/* recast pointer to capture old type for compiler */
		struct cgroup___old *cgrp_old = (void *)cgrp;

		return BPF_CORE_READ(cgrp_old, ancestor_ids[level]);
	}
}

static inline int get_cgroup_v1_idx(__u32 *cgrps, int size)
{
	struct task_struct *p = (void *)bpf_get_current_task();
	struct cgroup *cgrp;
	register int i = 0;
	__u32 *elem;
	int level;
	int cnt;

	// resolve the perf_event subsystem id once, preferring CO-RE
	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}
	cgrp = BPF_CORE_READ(p, cgroups, subsys[perf_subsys_id], cgroup);
	level = BPF_CORE_READ(cgrp, level);

	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id;

		if (i > level)
			break;

		// convert cgroup-id to a map index
		cgrp_id = get_cgroup_v1_ancestor_id(cgrp, i);
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}

static inline int get_cgroup_v2_idx(__u32 *cgrps, int size)
{
	register int i = 0;
	__u32 *elem;
	int cnt;

	for (cnt = 0; i < BPERF_CGROUP__MAX_LEVELS; i++) {
		__u64 cgrp_id = bpf_get_current_ancestor_cgroup_id(i);

		if (cgrp_id == 0)
			break;

		// convert cgroup-id to a map index
		elem = bpf_map_lookup_elem(&cgrp_idx, &cgrp_id);
		if (!elem)
			continue;

		cgrps[cnt++] = *elem;
		if (cnt == size)
			break;
	}

	return cnt;
}
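
/*
 * Index layout used by bperf_cgroup_count() below:
 *   events:        perf-event array, keyed by (event index * num_cpus + cpu)
 *   prev_readings: per-cpu array, keyed by event index
 *   cgrp_readings: per-cpu array, keyed by
 *                  (cgroup index * num_events + event index)
 * The cgroup index is the value stored in the cgrp_idx hash map, which is
 * populated by the perf tool before loading.
 */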

static int bperf_cgroup_count(void)
{
	register __u32 idx = 0; // keep it in a register to satisfy the BPF verifier
	register int c = 0;
	struct bpf_perf_event_value val, delta, *prev_val, *cgrp_val;
	__u32 cpu = bpf_get_smp_processor_id();
	__u32 cgrp_idx[BPERF_CGROUP__MAX_LEVELS];
	int cgrp_cnt;
	__u32 key, cgrp;
	long err;

	if (use_cgroup_v2)
		cgrp_cnt = get_cgroup_v2_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);
	else
		cgrp_cnt = get_cgroup_v1_idx(cgrp_idx, BPERF_CGROUP__MAX_LEVELS);

	for ( ; idx < BPERF_CGROUP__MAX_EVENTS; idx++) {
		if (idx == num_events)
			break;

		// XXX: do not pass idx directly (for verifier)
		key = idx;
		// this is a per-cpu array used to compute the diff
		prev_val = bpf_map_lookup_elem(&prev_readings, &key);
		if (!prev_val) {
			val.counter = val.enabled = val.running = 0;
			bpf_map_update_elem(&prev_readings, &key, &val, BPF_ANY);

			prev_val = bpf_map_lookup_elem(&prev_readings, &key);
			if (!prev_val)
				continue;
		}

		// read from the global perf_event array
		key = idx * num_cpus + cpu;
		err = bpf_perf_event_read_value(&events, key, &val, sizeof(val));
		if (err)
			continue;

		if (enabled) {
			delta.counter = val.counter - prev_val->counter;
			delta.enabled = val.enabled - prev_val->enabled;
			delta.running = val.running - prev_val->running;

			for (c = 0; c < BPERF_CGROUP__MAX_LEVELS; c++) {
				if (c == cgrp_cnt)
					break;

				cgrp = cgrp_idx[c];

				// aggregate the result by cgroup
				key = cgrp * num_events + idx;
				cgrp_val = bpf_map_lookup_elem(&cgrp_readings, &key);
				if (cgrp_val) {
					cgrp_val->counter += delta.counter;
					cgrp_val->enabled += delta.enabled;
					cgrp_val->running += delta.running;
				} else {
					bpf_map_update_elem(&cgrp_readings, &key,
							    &delta, BPF_ANY);
				}
			}
		}

		*prev_val = val;
	}
	return 0;
}

// This will be attached to the cgroup-switches event on each cpu
SEC("perf_event")
int BPF_PROG(on_cgrp_switch)
{
	return bperf_cgroup_count();
}

// Run from userspace (e.g. via BPF_PROG_TEST_RUN) to sync counters on demand
SEC("raw_tp/sched_switch")
int BPF_PROG(trigger_read)
{
	return bperf_cgroup_count();
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
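
/*
 * Userspace usage sketch: before loading, the perf tool resizes the maps and
 * sets the const volatile globals through the generated skeleton. The exact
 * call sites live in perf itself; the variable names below (total_cpus,
 * nr_events, nr_cgroups) are illustrative only:
 *
 *	skel = bperf_cgroup_bpf__open();
 *	skel->rodata->num_cpus = total_cpus;
 *	skel->rodata->num_events = nr_events;
 *	bpf_map__set_max_entries(skel->maps.events, total_cpus * nr_events);
 *	bpf_map__set_max_entries(skel->maps.cgrp_idx, nr_cgroups);
 *	bpf_map__set_max_entries(skel->maps.prev_readings, nr_events);
 *	bpf_map__set_max_entries(skel->maps.cgrp_readings,
 *				 nr_cgroups * nr_events);
 *	bperf_cgroup_bpf__load(skel);
 */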