1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 4 * Copyright (c) 2022 Tejun Heo <tj@kernel.org> 5 * Copyright (c) 2022 David Vernet <dvernet@meta.com> 6 */ 7 #ifndef __SCX_COMMON_BPF_H 8 #define __SCX_COMMON_BPF_H 9 10 #ifdef LSP 11 #define __bpf__ 12 #include "../vmlinux/vmlinux.h" 13 #else 14 #include "vmlinux.h" 15 #endif 16 17 #include <bpf/bpf_helpers.h> 18 #include <bpf/bpf_tracing.h> 19 #include <asm-generic/errno.h> 20 #include "user_exit_info.h" 21 22 #define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ 23 #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ 24 #define PF_EXITING 0x00000004 25 #define CLOCK_MONOTONIC 1 26 27 /* 28 * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can 29 * lead to really confusing misbehaviors. Let's trigger a build failure. 30 */ 31 static inline void ___vmlinux_h_sanity_check___(void) 32 { 33 _Static_assert(SCX_DSQ_FLAG_BUILTIN, 34 "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); 35 } 36 37 s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; 38 s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; 39 void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; 40 void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; 41 u32 scx_bpf_dispatch_nr_slots(void) __ksym; 42 void scx_bpf_dispatch_cancel(void) __ksym; 43 bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; 44 void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; 45 void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; 46 bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 47 bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; 48 u32 scx_bpf_reenqueue_local(void) __ksym; 49 void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; 50 s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; 51 void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; 52 int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; 53 struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; 54 void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; 55 void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; 56 void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; 57 void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; 58 u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; 59 u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; 60 void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; 61 u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; 62 const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; 63 const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; 64 void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; 65 const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; 66 const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; 67 void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; 68 bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; 69 s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 70 s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; 71 bool scx_bpf_task_running(const struct task_struct *p) __ksym; 72 s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; 73 struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; 74 struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; 75 76 /* 77 * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from 78 * within bpf_for_each() loops. 79 */ 80 #define BPF_FOR_EACH_ITER (&___it) 81 82 static inline __attribute__((format(printf, 1, 2))) 83 void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} 84 85 /* 86 * Helper macro for initializing the fmt and variadic argument inputs to both 87 * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to 88 * refer to the initialized list of inputs to the bstr kfunc. 89 */ 90 #define scx_bpf_bstr_preamble(fmt, args...) \ 91 static char ___fmt[] = fmt; \ 92 /* \ 93 * Note that __param[] must have at least one \ 94 * element to keep the verifier happy. \ 95 */ \ 96 unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ 97 \ 98 _Pragma("GCC diagnostic push") \ 99 _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ 100 ___bpf_fill(___param, args); \ 101 _Pragma("GCC diagnostic pop") \ 102 103 /* 104 * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments 105 * instead of an array of u64. Using this macro will cause the scheduler to 106 * exit cleanly with the specified exit code being passed to user space. 107 */ 108 #define scx_bpf_exit(code, fmt, args...) \ 109 ({ \ 110 scx_bpf_bstr_preamble(fmt, args) \ 111 scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ 112 ___scx_bpf_bstr_format_checker(fmt, ##args); \ 113 }) 114 115 /* 116 * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments 117 * instead of an array of u64. Invoking this macro will cause the scheduler to 118 * exit in an erroneous state, with diagnostic information being passed to the 119 * user. 120 */ 121 #define scx_bpf_error(fmt, args...) \ 122 ({ \ 123 scx_bpf_bstr_preamble(fmt, args) \ 124 scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ 125 ___scx_bpf_bstr_format_checker(fmt, ##args); \ 126 }) 127 128 /* 129 * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments 130 * instead of an array of u64. To be used from ops.dump() and friends. 131 */ 132 #define scx_bpf_dump(fmt, args...) \ 133 ({ \ 134 scx_bpf_bstr_preamble(fmt, args) \ 135 scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ 136 ___scx_bpf_bstr_format_checker(fmt, ##args); \ 137 }) 138 139 #define BPF_STRUCT_OPS(name, args...) \ 140 SEC("struct_ops/"#name) \ 141 BPF_PROG(name, ##args) 142 143 #define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ 144 SEC("struct_ops.s/"#name) \ 145 BPF_PROG(name, ##args) 146 147 /** 148 * RESIZABLE_ARRAY - Generates annotations for an array that may be resized 149 * @elfsec: the data section of the BPF program in which to place the array 150 * @arr: the name of the array 151 * 152 * libbpf has an API for setting map value sizes. Since data sections (i.e. 153 * bss, data, rodata) themselves are maps, a data section can be resized. If 154 * a data section has an array as its last element, the BTF info for that 155 * array will be adjusted so that length of the array is extended to meet the 156 * new length of the data section. This macro annotates an array to have an 157 * element count of one with the assumption that this array can be resized 158 * within the userspace program. It also annotates the section specifier so 159 * this array exists in a custom sub data section which can be resized 160 * independently. 161 * 162 * See RESIZE_ARRAY() for the userspace convenience macro for resizing an 163 * array declared with RESIZABLE_ARRAY(). 164 */ 165 #define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) 166 167 /** 168 * MEMBER_VPTR - Obtain the verified pointer to a struct or array member 169 * @base: struct or array to index 170 * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) 171 * 172 * The verifier often gets confused by the instruction sequence the compiler 173 * generates for indexing struct fields or arrays. This macro forces the 174 * compiler to generate a code sequence which first calculates the byte offset, 175 * checks it against the struct or array size and add that byte offset to 176 * generate the pointer to the member to help the verifier. 177 * 178 * Ideally, we want to abort if the calculated offset is out-of-bounds. However, 179 * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller 180 * must check for %NULL and take appropriate action to appease the verifier. To 181 * avoid confusing the verifier, it's best to check for %NULL and dereference 182 * immediately. 183 * 184 * vptr = MEMBER_VPTR(my_array, [i][j]); 185 * if (!vptr) 186 * return error; 187 * *vptr = new_value; 188 * 189 * sizeof(@base) should encompass the memory area to be accessed and thus can't 190 * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of 191 * `MEMBER_VPTR(ptr, ->member)`. 192 */ 193 #define MEMBER_VPTR(base, member) (typeof((base) member) *) \ 194 ({ \ 195 u64 __base = (u64)&(base); \ 196 u64 __addr = (u64)&((base) member) - __base; \ 197 _Static_assert(sizeof(base) >= sizeof((base) member), \ 198 "@base is smaller than @member, is @base a pointer?"); \ 199 asm volatile ( \ 200 "if %0 <= %[max] goto +2\n" \ 201 "%0 = 0\n" \ 202 "goto +1\n" \ 203 "%0 += %1\n" \ 204 : "+r"(__addr) \ 205 : "r"(__base), \ 206 [max]"i"(sizeof(base) - sizeof((base) member))); \ 207 __addr; \ 208 }) 209 210 /** 211 * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element 212 * @arr: array to index into 213 * @i: array index 214 * @n: number of elements in array 215 * 216 * Similar to MEMBER_VPTR() but is intended for use with arrays where the 217 * element count needs to be explicit. 218 * It can be used in cases where a global array is defined with an initial 219 * size but is intended to be be resized before loading the BPF program. 220 * Without this version of the macro, MEMBER_VPTR() will use the compile time 221 * size of the array to compute the max, which will result in rejection by 222 * the verifier. 223 */ 224 #define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ 225 ({ \ 226 u64 __base = (u64)arr; \ 227 u64 __addr = (u64)&(arr[i]) - __base; \ 228 asm volatile ( \ 229 "if %0 <= %[max] goto +2\n" \ 230 "%0 = 0\n" \ 231 "goto +1\n" \ 232 "%0 += %1\n" \ 233 : "+r"(__addr) \ 234 : "r"(__base), \ 235 [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ 236 __addr; \ 237 }) 238 239 240 /* 241 * BPF declarations and helpers 242 */ 243 244 /* list and rbtree */ 245 #define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) 246 #define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) 247 248 void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; 249 void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; 250 251 #define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) 252 #define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) 253 254 void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; 255 void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; 256 struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; 257 struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; 258 struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, 259 struct bpf_rb_node *node) __ksym; 260 int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, 261 bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), 262 void *meta, __u64 off) __ksym; 263 #define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) 264 265 struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; 266 267 void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; 268 #define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) 269 270 /* task */ 271 struct task_struct *bpf_task_from_pid(s32 pid) __ksym; 272 struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; 273 void bpf_task_release(struct task_struct *p) __ksym; 274 275 /* cgroup */ 276 struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; 277 void bpf_cgroup_release(struct cgroup *cgrp) __ksym; 278 struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; 279 280 /* css iteration */ 281 struct bpf_iter_css; 282 struct cgroup_subsys_state; 283 extern int bpf_iter_css_new(struct bpf_iter_css *it, 284 struct cgroup_subsys_state *start, 285 unsigned int flags) __weak __ksym; 286 extern struct cgroup_subsys_state * 287 bpf_iter_css_next(struct bpf_iter_css *it) __weak __ksym; 288 extern void bpf_iter_css_destroy(struct bpf_iter_css *it) __weak __ksym; 289 290 /* cpumask */ 291 struct bpf_cpumask *bpf_cpumask_create(void) __ksym; 292 struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; 293 void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; 294 u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; 295 u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; 296 void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; 297 void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; 298 bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; 299 bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; 300 bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; 301 void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; 302 void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; 303 bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, 304 const struct cpumask *src2) __ksym; 305 void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, 306 const struct cpumask *src2) __ksym; 307 void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, 308 const struct cpumask *src2) __ksym; 309 bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; 310 bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; 311 bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; 312 bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; 313 bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; 314 void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; 315 u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; 316 u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, 317 const struct cpumask *src2) __ksym; 318 u32 bpf_cpumask_weight(const struct cpumask *cpumask) __ksym; 319 320 /* 321 * Access a cpumask in read-only mode (typically to check bits). 322 */ 323 static __always_inline const struct cpumask *cast_mask(struct bpf_cpumask *mask) 324 { 325 return (const struct cpumask *)mask; 326 } 327 328 /* rcu */ 329 void bpf_rcu_read_lock(void) __ksym; 330 void bpf_rcu_read_unlock(void) __ksym; 331 332 333 /* 334 * Other helpers 335 */ 336 337 /* useful compiler attributes */ 338 #define likely(x) __builtin_expect(!!(x), 1) 339 #define unlikely(x) __builtin_expect(!!(x), 0) 340 #define __maybe_unused __attribute__((__unused__)) 341 342 /* 343 * READ/WRITE_ONCE() are from kernel (include/asm-generic/rwonce.h). They 344 * prevent compiler from caching, redoing or reordering reads or writes. 345 */ 346 typedef __u8 __attribute__((__may_alias__)) __u8_alias_t; 347 typedef __u16 __attribute__((__may_alias__)) __u16_alias_t; 348 typedef __u32 __attribute__((__may_alias__)) __u32_alias_t; 349 typedef __u64 __attribute__((__may_alias__)) __u64_alias_t; 350 351 static __always_inline void __read_once_size(const volatile void *p, void *res, int size) 352 { 353 switch (size) { 354 case 1: *(__u8_alias_t *) res = *(volatile __u8_alias_t *) p; break; 355 case 2: *(__u16_alias_t *) res = *(volatile __u16_alias_t *) p; break; 356 case 4: *(__u32_alias_t *) res = *(volatile __u32_alias_t *) p; break; 357 case 8: *(__u64_alias_t *) res = *(volatile __u64_alias_t *) p; break; 358 default: 359 barrier(); 360 __builtin_memcpy((void *)res, (const void *)p, size); 361 barrier(); 362 } 363 } 364 365 static __always_inline void __write_once_size(volatile void *p, void *res, int size) 366 { 367 switch (size) { 368 case 1: *(volatile __u8_alias_t *) p = *(__u8_alias_t *) res; break; 369 case 2: *(volatile __u16_alias_t *) p = *(__u16_alias_t *) res; break; 370 case 4: *(volatile __u32_alias_t *) p = *(__u32_alias_t *) res; break; 371 case 8: *(volatile __u64_alias_t *) p = *(__u64_alias_t *) res; break; 372 default: 373 barrier(); 374 __builtin_memcpy((void *)p, (const void *)res, size); 375 barrier(); 376 } 377 } 378 379 #define READ_ONCE(x) \ 380 ({ \ 381 union { typeof(x) __val; char __c[1]; } __u = \ 382 { .__c = { 0 } }; \ 383 __read_once_size(&(x), __u.__c, sizeof(x)); \ 384 __u.__val; \ 385 }) 386 387 #define WRITE_ONCE(x, val) \ 388 ({ \ 389 union { typeof(x) __val; char __c[1]; } __u = \ 390 { .__val = (val) }; \ 391 __write_once_size(&(x), __u.__c, sizeof(x)); \ 392 __u.__val; \ 393 }) 394 395 /* 396 * log2_u32 - Compute the base 2 logarithm of a 32-bit exponential value. 397 * @v: The value for which we're computing the base 2 logarithm. 398 */ 399 static inline u32 log2_u32(u32 v) 400 { 401 u32 r; 402 u32 shift; 403 404 r = (v > 0xFFFF) << 4; v >>= r; 405 shift = (v > 0xFF) << 3; v >>= shift; r |= shift; 406 shift = (v > 0xF) << 2; v >>= shift; r |= shift; 407 shift = (v > 0x3) << 1; v >>= shift; r |= shift; 408 r |= (v >> 1); 409 return r; 410 } 411 412 /* 413 * log2_u64 - Compute the base 2 logarithm of a 64-bit exponential value. 414 * @v: The value for which we're computing the base 2 logarithm. 415 */ 416 static inline u32 log2_u64(u64 v) 417 { 418 u32 hi = v >> 32; 419 if (hi) 420 return log2_u32(hi) + 32 + 1; 421 else 422 return log2_u32(v) + 1; 423 } 424 425 #include "compat.bpf.h" 426 427 #endif /* __SCX_COMMON_BPF_H */ 428