1 // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) 2 // Copyright (c) 2022 Google 3 #include "vmlinux.h" 4 #include <bpf/bpf_helpers.h> 5 #include <bpf/bpf_tracing.h> 6 #include <bpf/bpf_core_read.h> 7 8 /* task->flags for off-cpu analysis */ 9 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 10 11 /* task->state for off-cpu analysis */ 12 #define TASK_INTERRUPTIBLE 0x0001 13 #define TASK_UNINTERRUPTIBLE 0x0002 14 15 /* create a new thread */ 16 #define CLONE_THREAD 0x10000 17 18 #define MAX_STACKS 32 19 #define MAX_ENTRIES 102400 20 21 struct tstamp_data { 22 __u32 stack_id; 23 __u32 state; 24 __u64 timestamp; 25 }; 26 27 struct offcpu_key { 28 __u32 pid; 29 __u32 tgid; 30 __u32 stack_id; 31 __u32 state; 32 __u64 cgroup_id; 33 }; 34 35 struct { 36 __uint(type, BPF_MAP_TYPE_STACK_TRACE); 37 __uint(key_size, sizeof(__u32)); 38 __uint(value_size, MAX_STACKS * sizeof(__u64)); 39 __uint(max_entries, MAX_ENTRIES); 40 } stacks SEC(".maps"); 41 42 struct { 43 __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 44 __uint(map_flags, BPF_F_NO_PREALLOC); 45 __type(key, int); 46 __type(value, struct tstamp_data); 47 } tstamp SEC(".maps"); 48 49 struct { 50 __uint(type, BPF_MAP_TYPE_HASH); 51 __uint(key_size, sizeof(struct offcpu_key)); 52 __uint(value_size, sizeof(__u64)); 53 __uint(max_entries, MAX_ENTRIES); 54 } off_cpu SEC(".maps"); 55 56 struct { 57 __uint(type, BPF_MAP_TYPE_HASH); 58 __uint(key_size, sizeof(__u32)); 59 __uint(value_size, sizeof(__u8)); 60 __uint(max_entries, 1); 61 } cpu_filter SEC(".maps"); 62 63 struct { 64 __uint(type, BPF_MAP_TYPE_HASH); 65 __uint(key_size, sizeof(__u32)); 66 __uint(value_size, sizeof(__u8)); 67 __uint(max_entries, 1); 68 } task_filter SEC(".maps"); 69 70 struct { 71 __uint(type, BPF_MAP_TYPE_HASH); 72 __uint(key_size, sizeof(__u64)); 73 __uint(value_size, sizeof(__u8)); 74 __uint(max_entries, 1); 75 } cgroup_filter SEC(".maps"); 76 77 /* new kernel task_struct definition */ 78 struct task_struct___new { 79 long __state; 80 } __attribute__((preserve_access_index)); 81 82 /* old kernel task_struct definition */ 83 struct task_struct___old { 84 long state; 85 } __attribute__((preserve_access_index)); 86 87 int enabled = 0; 88 int has_cpu = 0; 89 int has_task = 0; 90 int has_cgroup = 0; 91 int uses_tgid = 0; 92 93 const volatile bool has_prev_state = false; 94 const volatile bool needs_cgroup = false; 95 const volatile bool uses_cgroup_v1 = false; 96 97 int perf_subsys_id = -1; 98 99 /* 100 * Old kernel used to call it task_struct->state and now it's '__state'. 101 * Use BPF CO-RE "ignored suffix rule" to deal with it like below: 102 * 103 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes 104 */ 105 static inline int get_task_state(struct task_struct *t) 106 { 107 /* recast pointer to capture new type for compiler */ 108 struct task_struct___new *t_new = (void *)t; 109 110 if (bpf_core_field_exists(t_new->__state)) { 111 return BPF_CORE_READ(t_new, __state); 112 } else { 113 /* recast pointer to capture old type for compiler */ 114 struct task_struct___old *t_old = (void *)t; 115 116 return BPF_CORE_READ(t_old, state); 117 } 118 } 119 120 static inline __u64 get_cgroup_id(struct task_struct *t) 121 { 122 struct cgroup *cgrp; 123 124 if (!uses_cgroup_v1) 125 return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id); 126 127 if (perf_subsys_id == -1) { 128 #if __has_builtin(__builtin_preserve_enum_value) 129 perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id, 130 perf_event_cgrp_id); 131 #else 132 perf_subsys_id = perf_event_cgrp_id; 133 #endif 134 } 135 136 cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup); 137 return BPF_CORE_READ(cgrp, kn, id); 138 } 139 140 static inline int can_record(struct task_struct *t, int state) 141 { 142 /* kernel threads don't have user stack */ 143 if (t->flags & PF_KTHREAD) 144 return 0; 145 146 if (state != TASK_INTERRUPTIBLE && 147 state != TASK_UNINTERRUPTIBLE) 148 return 0; 149 150 if (has_cpu) { 151 __u32 cpu = bpf_get_smp_processor_id(); 152 __u8 *ok; 153 154 ok = bpf_map_lookup_elem(&cpu_filter, &cpu); 155 if (!ok) 156 return 0; 157 } 158 159 if (has_task) { 160 __u8 *ok; 161 __u32 pid; 162 163 if (uses_tgid) 164 pid = t->tgid; 165 else 166 pid = t->pid; 167 168 ok = bpf_map_lookup_elem(&task_filter, &pid); 169 if (!ok) 170 return 0; 171 } 172 173 if (has_cgroup) { 174 __u8 *ok; 175 __u64 cgrp_id = get_cgroup_id(t); 176 177 ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id); 178 if (!ok) 179 return 0; 180 } 181 182 return 1; 183 } 184 185 static int off_cpu_stat(u64 *ctx, struct task_struct *prev, 186 struct task_struct *next, int state) 187 { 188 __u64 ts; 189 __u32 stack_id; 190 struct tstamp_data *pelem; 191 192 ts = bpf_ktime_get_ns(); 193 194 if (!can_record(prev, state)) 195 goto next; 196 197 stack_id = bpf_get_stackid(ctx, &stacks, 198 BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK); 199 200 pelem = bpf_task_storage_get(&tstamp, prev, NULL, 201 BPF_LOCAL_STORAGE_GET_F_CREATE); 202 if (!pelem) 203 goto next; 204 205 pelem->timestamp = ts; 206 pelem->state = state; 207 pelem->stack_id = stack_id; 208 209 next: 210 pelem = bpf_task_storage_get(&tstamp, next, NULL, 0); 211 212 if (pelem && pelem->timestamp) { 213 struct offcpu_key key = { 214 .pid = next->pid, 215 .tgid = next->tgid, 216 .stack_id = pelem->stack_id, 217 .state = pelem->state, 218 .cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0, 219 }; 220 __u64 delta = ts - pelem->timestamp; 221 __u64 *total; 222 223 total = bpf_map_lookup_elem(&off_cpu, &key); 224 if (total) 225 *total += delta; 226 else 227 bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY); 228 229 /* prevent to reuse the timestamp later */ 230 pelem->timestamp = 0; 231 } 232 233 return 0; 234 } 235 236 SEC("tp_btf/task_newtask") 237 int on_newtask(u64 *ctx) 238 { 239 struct task_struct *task; 240 u64 clone_flags; 241 u32 pid; 242 u8 val = 1; 243 244 if (!uses_tgid) 245 return 0; 246 247 task = (struct task_struct *)bpf_get_current_task(); 248 249 pid = BPF_CORE_READ(task, tgid); 250 if (!bpf_map_lookup_elem(&task_filter, &pid)) 251 return 0; 252 253 task = (struct task_struct *)ctx[0]; 254 clone_flags = ctx[1]; 255 256 pid = task->tgid; 257 if (!(clone_flags & CLONE_THREAD)) 258 bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST); 259 260 return 0; 261 } 262 263 SEC("tp_btf/sched_switch") 264 int on_switch(u64 *ctx) 265 { 266 struct task_struct *prev, *next; 267 int prev_state; 268 269 if (!enabled) 270 return 0; 271 272 prev = (struct task_struct *)ctx[1]; 273 next = (struct task_struct *)ctx[2]; 274 275 if (has_prev_state) 276 prev_state = (int)ctx[3]; 277 else 278 prev_state = get_task_state(prev); 279 280 return off_cpu_stat(ctx, prev, next, prev_state & 0xff); 281 } 282 283 char LICENSE[] SEC("license") = "Dual BSD/GPL"; 284