1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 // Copyright (c) 2022 Google 3 #include "vmlinux.h" 4 #include <bpf/bpf_helpers.h> 5 #include <bpf/bpf_tracing.h> 6 #include <bpf/bpf_core_read.h> 7 8 /* task->flags for off-cpu analysis */ 9 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 10 11 /* task->state for off-cpu analysis */ 12 #define TASK_INTERRUPTIBLE 0x0001 13 #define TASK_UNINTERRUPTIBLE 0x0002 14 15 /* create a new thread */ 16 #define CLONE_THREAD 0x10000 17 18 #define MAX_STACKS 32 19 #define MAX_ENTRIES 102400 20 21 struct tstamp_data { 22 __u32 stack_id; 23 __u32 state; 24 __u64 timestamp; 25 }; 26 27 struct offcpu_key { 28 __u32 pid; 29 __u32 tgid; 30 __u32 stack_id; 31 __u32 state; 32 __u64 cgroup_id; 33 }; 34 35 struct { 36 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 37 __uint(key_size, sizeof(__u32)); 38 __uint(value_size, MAX_STACKS * sizeof(__u64)); 39 __uint(max_entries, MAX_ENTRIES); 40 } stacks SEC(".maps"); 41 42 struct { 43 __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 44 __uint(map_flags, BPF_F_NO_PREALLOC); 45 __type(key, int); 46 __type(value, struct tstamp_data); 47 } tstamp SEC(".maps"); 48 49 struct { 50 __uint(type, BPF_MAP_TYPE_HASH); 51 __uint(key_size, sizeof(struct offcpu_key)); 52 __uint(value_size, sizeof(__u64)); 53 __uint(max_entries, MAX_ENTRIES); 54 } off_cpu SEC(".maps"); 55 56 struct { 57 __uint(type, BPF_MAP_TYPE_HASH); 58 __uint(key_size, sizeof(__u32)); 59 __uint(value_size, sizeof(__u8)); 60 __uint(max_entries, 1); 61 } cpu_filter SEC(".maps"); 62 63 struct { 64 __uint(type, BPF_MAP_TYPE_HASH); 65 __uint(key_size, sizeof(__u32)); 66 __uint(value_size, sizeof(__u8)); 67 __uint(max_entries, 1); 68 } task_filter SEC(".maps"); 69 70 struct { 71 __uint(type, BPF_MAP_TYPE_HASH); 72 __uint(key_size, sizeof(__u64)); 73 __uint(value_size, sizeof(__u8)); 74 __uint(max_entries, 1); 75 } cgroup_filter SEC(".maps"); 76 77 /* new kernel task_struct definition */ 78 struct task_struct___new { 79 long __state; 80 } __attribute__((preserve_access_index)); 81 82 /* old kernel task_struct definition */ 83 struct task_struct___old { 84 long state; 85 } __attribute__((preserve_access_index)); 86 87 int enabled = 0; 88 89 const volatile int has_cpu = 0; 90 const volatile int has_task = 0; 91 const volatile int has_cgroup = 0; 92 const volatile int uses_tgid = 0; 93 94 const volatile bool has_prev_state = false; 95 const volatile bool needs_cgroup = false; 96 const volatile bool uses_cgroup_v1 = false; 97 98 int perf_subsys_id = -1; 99 100 /* 101 * Old kernel used to call it task_struct->state and now it's '__state'. 102 * Use BPF CO-RE "ignored suffix rule" to deal with it like below: 103 * 104 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes 105 */ 106 static inline int get_task_state(struct task_struct *t) 107 { 108 /* recast pointer to capture new type for compiler */ 109 struct task_struct___new *t_new = (void *)t; 110 111 if (bpf_core_field_exists(t_new->__state)) { 112 return BPF_CORE_READ(t_new, __state); 113 } else { 114 /* recast pointer to capture old type for compiler */ 115 struct task_struct___old *t_old = (void *)t; 116 117 return BPF_CORE_READ(t_old, state); 118 } 119 } 120 121 static inline __u64 get_cgroup_id(struct task_struct *t) 122 { 123 struct cgroup *cgrp; 124 125 if (!uses_cgroup_v1) 126 return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); 127 128 if (perf_subsys_id == -1) { 129 #if __has_builtin(__builtin_preserve_enum_value) 130 perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 131 perf_event_cgrp_id); 132 #else 133 perf_subsys_id = perf_event_cgrp_id; 134 #endif 135 } 136 137 cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); 138 return BPF_CORE_READ(cgrp, kn, id); 139 } 140 141 static inline int can_record(struct task_struct *t, int state) 142 { 143 /* kernel threads don't have user stack */ 144 if (t->flags & PF_KTHREAD) 145 return 0; 146 147 if (state != TASK_INTERRUPTIBLE && 148 state != TASK_UNINTERRUPTIBLE) 149 return 0; 150 151 if (has_cpu) { 152 __u32 cpu = bpf_get_smp_processor_id(); 153 __u8 *ok; 154 155 ok = bpf_map_lookup_elem(&cpu_filter, &cpu); 156 if (!ok) 157 return 0; 158 } 159 160 if (has_task) { 161 __u8 *ok; 162 __u32 pid; 163 164 if (uses_tgid) 165 pid = t->tgid; 166 else 167 pid = t->pid; 168 169 ok = bpf_map_lookup_elem(&task_filter, &pid); 170 if (!ok) 171 return 0; 172 } 173 174 if (has_cgroup) { 175 __u8 *ok; 176 __u64 cgrp_id = get_cgroup_id(t); 177 178 ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); 179 if (!ok) 180 return 0; 181 } 182 183 return 1; 184 } 185 186 static int off_cpu_stat(u64 *ctx, struct task_struct *prev, 187 struct task_struct *next, int state) 188 { 189 __u64 ts; 190 __u32 stack_id; 191 struct tstamp_data *pelem; 192 193 ts = bpf_ktime_get_ns(); 194 195 if (!can_record(prev, state)) 196 goto next; 197 198 stack_id = bpf_get_stackid(ctx, &stacks, 199 BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); 200 201 pelem = bpf_task_storage_get(&tstamp, prev, NULL, 202 BPF_LOCAL_STORAGE_GET_F_CREATE); 203 if (!pelem) 204 goto next; 205 206 pelem->timestamp = ts; 207 pelem->state = state; 208 pelem->stack_id = stack_id; 209 210 next: 211 pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); 212 213 if (pelem && pelem->timestamp) { 214 struct offcpu_key key = { 215 .pid = next->pid, 216 .tgid = next->tgid, 217 .stack_id = pelem->stack_id, 218 .state = pelem->state, 219 .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, 220 }; 221 __u64 delta = ts - pelem->timestamp; 222 __u64 *total; 223 224 total = bpf_map_lookup_elem(&off_cpu, &key); 225 if (total) 226 *total += delta; 227 else 228 bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); 229 230 /* prevent to reuse the timestamp later */ 231 pelem->timestamp = 0; 232 } 233 234 return 0; 235 } 236 237 SEC("tp_btf/task_newtask") 238 int on_newtask(u64 *ctx) 239 { 240 struct task_struct *task; 241 u64 clone_flags; 242 u32 pid; 243 u8 val = 1; 244 245 if (!uses_tgid) 246 return 0; 247 248 task = (struct task_struct *)bpf_get_current_task(); 249 250 pid = BPF_CORE_READ(task, tgid); 251 if (!bpf_map_lookup_elem(&task_filter, &pid)) 252 return 0; 253 254 task = (struct task_struct *)ctx[0]; 255 clone_flags = ctx[1]; 256 257 pid = task->tgid; 258 if (!(clone_flags & CLONE_THREAD)) 259 bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); 260 261 return 0; 262 } 263 264 SEC("tp_btf/sched_switch") 265 int on_switch(u64 *ctx) 266 { 267 struct task_struct *prev, *next; 268 int prev_state; 269 270 if (!enabled) 271 return 0; 272 273 prev = (struct task_struct *)ctx[1]; 274 next = (struct task_struct *)ctx[2]; 275 276 if (has_prev_state) 277 prev_state = (int)ctx[3]; 278 else 279 prev_state = get_task_state(prev); 280 281 return off_cpu_stat(ctx, prev, next, prev_state & 0xff); 282 } 283 284 char LICENSE[] SEC("license") = "Dual BSD/GPL"; 285