// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

/* task->flags for off-cpu analysis */
#define PF_KTHREAD	0x00200000	/* I am a kernel thread */

/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_UNINTERRUPTIBLE	0x0002

/* create a new thread */
#define CLONE_THREAD	0x10000

#define MAX_STACKS	32
#define MAX_ENTRIES	102400

#define MAX_CPUS	4096
#define MAX_OFFCPU_LEN	37

// We have a 'struct stack' in vmlinux.h when building with GEN_VMLINUX_H=1
struct __stack {
	u64 array[MAX_STACKS];
};

struct tstamp_data {
	__u32 stack_id;
	__u32 state;
	__u64 timestamp;
	struct __stack stack;
};

struct offcpu_key {
	__u32 pid;
	__u32 tgid;
	__u32 stack_id;
	__u32 state;
	__u64 cgroup_id;
};

struct {
	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, MAX_STACKS * sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");

struct offcpu_data {
	u64 array[MAX_OFFCPU_LEN];
};

struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(key_size, sizeof(int));
	__uint(value_size, sizeof(int));
	__uint(max_entries, MAX_CPUS);
} offcpu_output SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(struct offcpu_data));
	__uint(max_entries, 1);
} offcpu_payload SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct tstamp_data);
} tstamp SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(struct offcpu_key));
	__uint(value_size, sizeof(__u64));
	__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cpu_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u32));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} task_filter SEC(".maps");

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(key_size, sizeof(__u64));
	__uint(value_size, sizeof(__u8));
	__uint(max_entries, 1);
} cgroup_filter SEC(".maps");

/* new kernel task_struct definition */
struct task_struct___new {
	long __state;
} __attribute__((preserve_access_index));

/* old kernel task_struct definition */
struct task_struct___old {
	long state;
} __attribute__((preserve_access_index));

int enabled = 0;

const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;

const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;

int perf_subsys_id = -1;

__u64 offcpu_thresh_ns;

/*
 * Old kernel used to call it task_struct->state and now it's '__state'.
 * Use BPF CO-RE "ignored suffix rule" to deal with it like below:
 *
 * https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
 */
static inline int get_task_state(struct task_struct *t)
{
	/* recast pointer to capture new type for compiler */
	struct task_struct___new *t_new = (void *)t;

	if (bpf_core_field_exists(t_new->__state)) {
		return BPF_CORE_READ(t_new, __state);
	} else {
		/* recast pointer to capture old type for compiler */
		struct task_struct___old *t_old = (void *)t;

		return BPF_CORE_READ(t_old, state);
	}
}

static inline __u64 get_cgroup_id(struct task_struct *t)
{
	struct cgroup *cgrp;

	if (!uses_cgroup_v1)
		return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);

	if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
		perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
						     perf_event_cgrp_id);
#else
		perf_subsys_id = perf_event_cgrp_id;
#endif
	}

	cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
	return BPF_CORE_READ(cgrp, kn, id);
}

static inline int can_record(struct task_struct *t, int state)
{
	/* kernel threads don't have a user stack */
	if (t->flags & PF_KTHREAD)
		return 0;

	if (state != TASK_INTERRUPTIBLE &&
	    state != TASK_UNINTERRUPTIBLE)
		return 0;

	if (has_cpu) {
		__u32 cpu = bpf_get_smp_processor_id();
		__u8 *ok;

		ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
		if (!ok)
			return 0;
	}

	if (has_task) {
		__u8 *ok;
		__u32 pid;

		if (uses_tgid)
			pid = t->tgid;
		else
			pid = t->pid;

		ok = bpf_map_lookup_elem(&task_filter, &pid);
		if (!ok)
			return 0;
	}

	if (has_cgroup) {
		__u8 *ok;
		__u64 cgrp_id = get_cgroup_id(t);

		ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
		if (!ok)
			return 0;
	}

	return 1;
}

static inline int copy_stack(struct __stack *from, struct offcpu_data *to, int n)
{
	int len = 0;

	for (int i = 0; i < MAX_STACKS && from->array[i]; ++i, ++len)
		to->array[n + 2 + i] = from->array[i];

	return len;
}

/**
 * off_cpu_dump - dump off-cpu samples to the ring buffer
 * @data: payload for dumping off-cpu samples
 * @key: off-cpu key of the task
 * @stack: stack trace of the task before being scheduled out
 * @delta: off-cpu time of the task, in nanoseconds
 *
 * If the off-cpu time threshold is reached, collect the tid, period, callchain, and cgroup id
 * of the task, and dump them as a raw sample to the perf ring buffer.
 */
static int off_cpu_dump(void *ctx, struct offcpu_data *data, struct offcpu_key *key,
			struct __stack *stack, __u64 delta)
{
	int n = 0, len = 0;

	data->array[n++] = (u64)key->tgid << 32 | key->pid;
	data->array[n++] = delta;

	/* data->array[n] is callchain->nr (updated later) */
	data->array[n + 1] = PERF_CONTEXT_USER;
	data->array[n + 2] = 0;

	len = copy_stack(stack, data, n);

	/* update length of callchain */
	data->array[n] = len + 1;
	n += len + 2;

	data->array[n++] = key->cgroup_id;

	return bpf_perf_event_output(ctx, &offcpu_output, BPF_F_CURRENT_CPU, data, n * sizeof(u64));
}

static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
			struct task_struct *next, int state)
{
	__u64 ts;
	__u32 stack_id;
	struct tstamp_data *pelem;

	ts = bpf_ktime_get_ns();

	if (!can_record(prev, state))
		goto next;

	stack_id = bpf_get_stackid(ctx, &stacks,
				   BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);

	pelem = bpf_task_storage_get(&tstamp, prev, NULL,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!pelem)
		goto next;

	pelem->timestamp = ts;
	pelem->state = state;
	pelem->stack_id = stack_id;

	/*
	 * If stacks are successfully collected by bpf_get_stackid(), collect them once more
	 * in task_storage for direct off-cpu sample dumping.
	 */
	if (stack_id > 0 && bpf_get_stack(ctx, &pelem->stack, MAX_STACKS * sizeof(u64), BPF_F_USER_STACK)) {
		/*
		 * This empty if block is used to avoid the 'result unused' warning from
		 * bpf_get_stack().  If the collection fails, continue with the logic for
		 * the next task.
		 */
	}
next:
	pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);

	if (pelem && pelem->timestamp) {
		struct offcpu_key key = {
			.pid = next->pid,
			.tgid = next->tgid,
			.stack_id = pelem->stack_id,
			.state = pelem->state,
			.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
		};
		__u64 delta = ts - pelem->timestamp;
		__u64 *total;

		if (delta >= offcpu_thresh_ns) {
			int zero = 0;
			struct offcpu_data *data = bpf_map_lookup_elem(&offcpu_payload, &zero);

			if (data)
				off_cpu_dump(ctx, data, &key, &pelem->stack, delta);
		} else {
			total = bpf_map_lookup_elem(&off_cpu, &key);
			if (total)
				*total += delta;
			else
				bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
		}

		/* prevent the timestamp from being reused later */
		pelem->timestamp = 0;
	}

	return 0;
}

SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
	struct task_struct *task;
	u64 clone_flags;
	u32 pid;
	u8 val = 1;

	if (!uses_tgid)
		return 0;

	task = (struct task_struct *)bpf_get_current_task();

	pid = BPF_CORE_READ(task, tgid);
	if (!bpf_map_lookup_elem(&task_filter, &pid))
		return 0;

	task = (struct task_struct *)ctx[0];
	clone_flags = ctx[1];

	pid = task->tgid;
	if (!(clone_flags & CLONE_THREAD))
		bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);

	return 0;
}

SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
	struct task_struct *prev, *next;
	int prev_state;

	if (!enabled)
		return 0;

	prev = (struct task_struct *)ctx[1];
	next = (struct task_struct *)ctx[2];

	if (has_prev_state)
		prev_state = (int)ctx[3];
	else
		prev_state = get_task_state(prev);

	return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}

char LICENSE[] SEC("license") = "Dual BSD/GPL";
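
/*
 * For reference, a sketch of the raw sample payload laid out by off_cpu_dump()
 * above, one u64 per slot with 'len' user stack entries.  This is only an
 * illustrative summary; the code in off_cpu_dump() is authoritative:
 *
 *   [0]           (u64)tgid << 32 | pid
 *   [1]           off-cpu time (delta) in nanoseconds, used as the period
 *   [2]           callchain length: len + 1 (PERF_CONTEXT_USER plus len entries)
 *   [3]           PERF_CONTEXT_USER
 *   [4 .. 3+len]  user stack entries copied from task storage
 *   [4+len]       cgroup id (0 unless needs_cgroup is set)
 *
 * Presumably the perf user-space side decodes this as tid, period, callchain,
 * and cgroup sample fields; that parsing lives outside this file.
 */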