136929ebdSEmil Tsalapatis /* SPDX-License-Identifier: GPL-2.0 */ 236929ebdSEmil Tsalapatis /* 336929ebdSEmil Tsalapatis * Arena-based task data scheduler. This is a variation of scx_simple 436929ebdSEmil Tsalapatis * that uses a combined allocator and indexing structure to organize 536929ebdSEmil Tsalapatis * task data. Task context allocation is done when a task enters the 636929ebdSEmil Tsalapatis * scheduler, while freeing is done when it exits. Task contexts are 736929ebdSEmil Tsalapatis * retrieved from task-local storage, pointing to the allocated memory. 836929ebdSEmil Tsalapatis * 936929ebdSEmil Tsalapatis * The main purpose of this scheduler is to demostrate arena memory 1036929ebdSEmil Tsalapatis * management. 1136929ebdSEmil Tsalapatis * 1236929ebdSEmil Tsalapatis * Copyright (c) 2024-2025 Meta Platforms, Inc. and affiliates. 1336929ebdSEmil Tsalapatis * Copyright (c) 2024-2025 Emil Tsalapatis <etsal@meta.com> 1436929ebdSEmil Tsalapatis * Copyright (c) 2024-2025 Tejun Heo <tj@kernel.org> 1536929ebdSEmil Tsalapatis * 1636929ebdSEmil Tsalapatis */ 1736929ebdSEmil Tsalapatis #include <scx/common.bpf.h> 1836929ebdSEmil Tsalapatis #include <scx/bpf_arena_common.bpf.h> 1936929ebdSEmil Tsalapatis 2036929ebdSEmil Tsalapatis #include "scx_sdt.h" 2136929ebdSEmil Tsalapatis 2236929ebdSEmil Tsalapatis char _license[] SEC("license") = "GPL"; 2336929ebdSEmil Tsalapatis 2436929ebdSEmil Tsalapatis UEI_DEFINE(uei); 2536929ebdSEmil Tsalapatis 2636929ebdSEmil Tsalapatis struct { 2736929ebdSEmil Tsalapatis __uint(type, BPF_MAP_TYPE_ARENA); 2836929ebdSEmil Tsalapatis __uint(map_flags, BPF_F_MMAPABLE); 2936929ebdSEmil Tsalapatis #if defined(__TARGET_ARCH_arm64) || defined(__aarch64__) 3036929ebdSEmil Tsalapatis __uint(max_entries, 1 << 16); /* number of pages */ 3136929ebdSEmil Tsalapatis __ulong(map_extra, (1ull << 32)); /* start of mmap() region */ 3236929ebdSEmil Tsalapatis #else 3336929ebdSEmil Tsalapatis __uint(max_entries, 1 << 20); /* number of pages */ 3436929ebdSEmil Tsalapatis __ulong(map_extra, (1ull << 44)); /* start of mmap() region */ 3536929ebdSEmil Tsalapatis #endif 3636929ebdSEmil Tsalapatis } arena __weak SEC(".maps"); 3736929ebdSEmil Tsalapatis 3836929ebdSEmil Tsalapatis #define SHARED_DSQ 0 3936929ebdSEmil Tsalapatis 4036929ebdSEmil Tsalapatis #define DEFINE_SDT_STAT(metric) \ 4136929ebdSEmil Tsalapatis static inline void \ 4236929ebdSEmil Tsalapatis stat_inc_##metric(struct scx_stats __arena *stats) \ 4336929ebdSEmil Tsalapatis { \ 4436929ebdSEmil Tsalapatis cast_kern(stats); \ 4536929ebdSEmil Tsalapatis stats->metric += 1; \ 4636929ebdSEmil Tsalapatis } \ 4736929ebdSEmil Tsalapatis __u64 stat_##metric; \ 4836929ebdSEmil Tsalapatis 4936929ebdSEmil Tsalapatis DEFINE_SDT_STAT(enqueue); 5036929ebdSEmil Tsalapatis DEFINE_SDT_STAT(init); 5136929ebdSEmil Tsalapatis DEFINE_SDT_STAT(exit); 5236929ebdSEmil Tsalapatis DEFINE_SDT_STAT(select_idle_cpu); 5336929ebdSEmil Tsalapatis DEFINE_SDT_STAT(select_busy_cpu); 5436929ebdSEmil Tsalapatis 5536929ebdSEmil Tsalapatis /* 5636929ebdSEmil Tsalapatis * Necessary for cond_break/can_loop's semantics. According to kernel commit 5736929ebdSEmil Tsalapatis * 011832b, the loop counter variable must be seen as imprecise and bounded 5836929ebdSEmil Tsalapatis * by the verifier. Initializing it from a constant (e.g., i = 0;), then, 5936929ebdSEmil Tsalapatis * makes it precise and prevents may_goto from helping with converging the 6036929ebdSEmil Tsalapatis * loop. For these loops we must initialize the loop counter from a variable 6136929ebdSEmil Tsalapatis * whose value the verifier cannot reason about when checking the program, so 6236929ebdSEmil Tsalapatis * that the loop counter's value is imprecise. 6336929ebdSEmil Tsalapatis */ 6436929ebdSEmil Tsalapatis static __u64 zero = 0; 6536929ebdSEmil Tsalapatis 6636929ebdSEmil Tsalapatis /* 6736929ebdSEmil Tsalapatis * XXX Hack to get the verifier to find the arena for sdt_exit_task. 6836929ebdSEmil Tsalapatis * As of 6.12-rc5, The verifier associates arenas with programs by 6936929ebdSEmil Tsalapatis * checking LD.IMM instruction operands for an arena and populating 7036929ebdSEmil Tsalapatis * the program state with the first instance it finds. This requires 7136929ebdSEmil Tsalapatis * accessing our global arena variable, but scx methods do not necessarily 7236929ebdSEmil Tsalapatis * do so while still using pointers from that arena. Insert a bpf_printk 7336929ebdSEmil Tsalapatis * statement that triggers at most once to generate an LD.IMM instruction 7436929ebdSEmil Tsalapatis * to access the arena and help the verifier. 7536929ebdSEmil Tsalapatis */ 7636929ebdSEmil Tsalapatis static volatile bool scx_arena_verify_once; 7736929ebdSEmil Tsalapatis 7836929ebdSEmil Tsalapatis __hidden void scx_arena_subprog_init(void) 7936929ebdSEmil Tsalapatis { 8036929ebdSEmil Tsalapatis if (scx_arena_verify_once) 8136929ebdSEmil Tsalapatis return; 8236929ebdSEmil Tsalapatis 8336929ebdSEmil Tsalapatis bpf_printk("%s: arena pointer %p", __func__, &arena); 8436929ebdSEmil Tsalapatis scx_arena_verify_once = true; 8536929ebdSEmil Tsalapatis } 8636929ebdSEmil Tsalapatis 8736929ebdSEmil Tsalapatis 8836929ebdSEmil Tsalapatis private(LOCK) struct bpf_spin_lock alloc_lock; 8936929ebdSEmil Tsalapatis private(POOL_LOCK) struct bpf_spin_lock alloc_pool_lock; 9036929ebdSEmil Tsalapatis 9136929ebdSEmil Tsalapatis /* allocation pools */ 9236929ebdSEmil Tsalapatis struct sdt_pool desc_pool; 9336929ebdSEmil Tsalapatis struct sdt_pool chunk_pool; 9436929ebdSEmil Tsalapatis 9536929ebdSEmil Tsalapatis /* Protected by alloc_lock. */ 9636929ebdSEmil Tsalapatis struct scx_alloc_stats alloc_stats; 9736929ebdSEmil Tsalapatis 9836929ebdSEmil Tsalapatis 9936929ebdSEmil Tsalapatis /* Allocate element from the pool. Must be called with a then pool lock held. */ 10036929ebdSEmil Tsalapatis static 10136929ebdSEmil Tsalapatis void __arena *scx_alloc_from_pool(struct sdt_pool *pool) 10236929ebdSEmil Tsalapatis { 10336929ebdSEmil Tsalapatis __u64 elem_size, max_elems; 10436929ebdSEmil Tsalapatis void __arena *slab; 10536929ebdSEmil Tsalapatis void __arena *ptr; 10636929ebdSEmil Tsalapatis 10736929ebdSEmil Tsalapatis elem_size = pool->elem_size; 10836929ebdSEmil Tsalapatis max_elems = pool->max_elems; 10936929ebdSEmil Tsalapatis 11036929ebdSEmil Tsalapatis /* If the chunk is spent, get a new one. */ 11136929ebdSEmil Tsalapatis if (pool->idx >= max_elems) { 11236929ebdSEmil Tsalapatis slab = bpf_arena_alloc_pages(&arena, NULL, 11336929ebdSEmil Tsalapatis div_round_up(max_elems * elem_size, PAGE_SIZE), NUMA_NO_NODE, 0); 11436929ebdSEmil Tsalapatis if (!slab) 11536929ebdSEmil Tsalapatis return NULL; 11636929ebdSEmil Tsalapatis 11736929ebdSEmil Tsalapatis pool->slab = slab; 11836929ebdSEmil Tsalapatis pool->idx = 0; 11936929ebdSEmil Tsalapatis } 12036929ebdSEmil Tsalapatis 12136929ebdSEmil Tsalapatis ptr = (void __arena *)((__u64) pool->slab + elem_size * pool->idx); 12236929ebdSEmil Tsalapatis pool->idx += 1; 12336929ebdSEmil Tsalapatis 12436929ebdSEmil Tsalapatis return ptr; 12536929ebdSEmil Tsalapatis } 12636929ebdSEmil Tsalapatis 12736929ebdSEmil Tsalapatis /* Alloc desc and associated chunk. Called with the allocator spinlock held. */ 12836929ebdSEmil Tsalapatis static sdt_desc_t *scx_alloc_chunk(void) 12936929ebdSEmil Tsalapatis { 13036929ebdSEmil Tsalapatis struct sdt_chunk __arena *chunk; 13136929ebdSEmil Tsalapatis sdt_desc_t *desc; 13236929ebdSEmil Tsalapatis sdt_desc_t *out; 13336929ebdSEmil Tsalapatis 13436929ebdSEmil Tsalapatis chunk = scx_alloc_from_pool(&chunk_pool); 13536929ebdSEmil Tsalapatis if (!chunk) 13636929ebdSEmil Tsalapatis return NULL; 13736929ebdSEmil Tsalapatis 13836929ebdSEmil Tsalapatis desc = scx_alloc_from_pool(&desc_pool); 13936929ebdSEmil Tsalapatis if (!desc) { 14036929ebdSEmil Tsalapatis /* 14136929ebdSEmil Tsalapatis * Effectively frees the previous chunk allocation. 14236929ebdSEmil Tsalapatis * Index cannot be 0, so decrementing is always 14336929ebdSEmil Tsalapatis * valid. 14436929ebdSEmil Tsalapatis */ 14536929ebdSEmil Tsalapatis chunk_pool.idx -= 1; 14636929ebdSEmil Tsalapatis return NULL; 14736929ebdSEmil Tsalapatis } 14836929ebdSEmil Tsalapatis 14936929ebdSEmil Tsalapatis out = desc; 15036929ebdSEmil Tsalapatis 15136929ebdSEmil Tsalapatis desc->nr_free = SDT_TASK_ENTS_PER_CHUNK; 15236929ebdSEmil Tsalapatis desc->chunk = chunk; 15336929ebdSEmil Tsalapatis 15436929ebdSEmil Tsalapatis alloc_stats.chunk_allocs += 1; 15536929ebdSEmil Tsalapatis 15636929ebdSEmil Tsalapatis return out; 15736929ebdSEmil Tsalapatis } 15836929ebdSEmil Tsalapatis 15936929ebdSEmil Tsalapatis static int pool_set_size(struct sdt_pool *pool, __u64 data_size, __u64 nr_pages) 16036929ebdSEmil Tsalapatis { 16136929ebdSEmil Tsalapatis if (unlikely(data_size % 8)) 16236929ebdSEmil Tsalapatis return -EINVAL; 16336929ebdSEmil Tsalapatis 16436929ebdSEmil Tsalapatis if (unlikely(nr_pages == 0)) 16536929ebdSEmil Tsalapatis return -EINVAL; 16636929ebdSEmil Tsalapatis 16736929ebdSEmil Tsalapatis pool->elem_size = data_size; 16836929ebdSEmil Tsalapatis pool->max_elems = (PAGE_SIZE * nr_pages) / pool->elem_size; 16936929ebdSEmil Tsalapatis /* Populate the pool slab on the first allocation. */ 17036929ebdSEmil Tsalapatis pool->idx = pool->max_elems; 17136929ebdSEmil Tsalapatis 17236929ebdSEmil Tsalapatis return 0; 17336929ebdSEmil Tsalapatis } 17436929ebdSEmil Tsalapatis 17536929ebdSEmil Tsalapatis /* Initialize both the base pool allocators and the root chunk of the index. */ 17636929ebdSEmil Tsalapatis __hidden int 17736929ebdSEmil Tsalapatis scx_alloc_init(struct scx_allocator *alloc, __u64 data_size) 17836929ebdSEmil Tsalapatis { 17936929ebdSEmil Tsalapatis size_t min_chunk_size; 18036929ebdSEmil Tsalapatis int ret; 18136929ebdSEmil Tsalapatis 18236929ebdSEmil Tsalapatis _Static_assert(sizeof(struct sdt_chunk) <= PAGE_SIZE, 18336929ebdSEmil Tsalapatis "chunk size must fit into a page"); 18436929ebdSEmil Tsalapatis 18536929ebdSEmil Tsalapatis ret = pool_set_size(&chunk_pool, sizeof(struct sdt_chunk), 1); 18636929ebdSEmil Tsalapatis if (ret != 0) 18736929ebdSEmil Tsalapatis return ret; 18836929ebdSEmil Tsalapatis 18936929ebdSEmil Tsalapatis ret = pool_set_size(&desc_pool, sizeof(struct sdt_desc), 1); 19036929ebdSEmil Tsalapatis if (ret != 0) 19136929ebdSEmil Tsalapatis return ret; 19236929ebdSEmil Tsalapatis 19336929ebdSEmil Tsalapatis /* Wrap data into a descriptor and word align. */ 19436929ebdSEmil Tsalapatis data_size += sizeof(struct sdt_data); 19536929ebdSEmil Tsalapatis data_size = round_up(data_size, 8); 19636929ebdSEmil Tsalapatis 19736929ebdSEmil Tsalapatis /* 19836929ebdSEmil Tsalapatis * Ensure we allocate large enough chunks from the arena to avoid excessive 19936929ebdSEmil Tsalapatis * internal fragmentation when turning chunks it into structs. 20036929ebdSEmil Tsalapatis */ 20136929ebdSEmil Tsalapatis min_chunk_size = div_round_up(SDT_TASK_MIN_ELEM_PER_ALLOC * data_size, PAGE_SIZE); 20236929ebdSEmil Tsalapatis ret = pool_set_size(&alloc->pool, data_size, min_chunk_size); 20336929ebdSEmil Tsalapatis if (ret != 0) 20436929ebdSEmil Tsalapatis return ret; 20536929ebdSEmil Tsalapatis 20636929ebdSEmil Tsalapatis bpf_spin_lock(&alloc_lock); 20736929ebdSEmil Tsalapatis alloc->root = scx_alloc_chunk(); 20836929ebdSEmil Tsalapatis bpf_spin_unlock(&alloc_lock); 20936929ebdSEmil Tsalapatis if (!alloc->root) 21036929ebdSEmil Tsalapatis return -ENOMEM; 21136929ebdSEmil Tsalapatis 21236929ebdSEmil Tsalapatis return 0; 21336929ebdSEmil Tsalapatis } 21436929ebdSEmil Tsalapatis 21536929ebdSEmil Tsalapatis static 21636929ebdSEmil Tsalapatis int set_idx_state(sdt_desc_t *desc, __u64 pos, bool state) 21736929ebdSEmil Tsalapatis { 21836929ebdSEmil Tsalapatis __u64 __arena *allocated = desc->allocated; 21936929ebdSEmil Tsalapatis __u64 bit; 22036929ebdSEmil Tsalapatis 22136929ebdSEmil Tsalapatis if (unlikely(pos >= SDT_TASK_ENTS_PER_CHUNK)) 22236929ebdSEmil Tsalapatis return -EINVAL; 22336929ebdSEmil Tsalapatis 22436929ebdSEmil Tsalapatis bit = (__u64)1 << (pos % 64); 22536929ebdSEmil Tsalapatis 22636929ebdSEmil Tsalapatis if (state) 22736929ebdSEmil Tsalapatis allocated[pos / 64] |= bit; 22836929ebdSEmil Tsalapatis else 22936929ebdSEmil Tsalapatis allocated[pos / 64] &= ~bit; 23036929ebdSEmil Tsalapatis 23136929ebdSEmil Tsalapatis return 0; 23236929ebdSEmil Tsalapatis } 23336929ebdSEmil Tsalapatis 23436929ebdSEmil Tsalapatis static __noinline 23536929ebdSEmil Tsalapatis int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK_LEVELS]) 23636929ebdSEmil Tsalapatis { 23736929ebdSEmil Tsalapatis sdt_desc_t *desc; 23836929ebdSEmil Tsalapatis __u64 u, level; 23936929ebdSEmil Tsalapatis int ret; 24036929ebdSEmil Tsalapatis 24136929ebdSEmil Tsalapatis for (u = zero; u < SDT_TASK_LEVELS && can_loop; u++) { 24236929ebdSEmil Tsalapatis level = SDT_TASK_LEVELS - 1 - u; 24336929ebdSEmil Tsalapatis 24436929ebdSEmil Tsalapatis /* Only propagate upwards if we are the parent's only free chunk. */ 24536929ebdSEmil Tsalapatis desc = lv_desc[level]; 24636929ebdSEmil Tsalapatis 24736929ebdSEmil Tsalapatis ret = set_idx_state(desc, lv_pos[level], false); 24836929ebdSEmil Tsalapatis if (unlikely(ret != 0)) 24936929ebdSEmil Tsalapatis return ret; 25036929ebdSEmil Tsalapatis 25136929ebdSEmil Tsalapatis desc->nr_free += 1; 25236929ebdSEmil Tsalapatis if (desc->nr_free > 1) 25336929ebdSEmil Tsalapatis return 0; 25436929ebdSEmil Tsalapatis } 25536929ebdSEmil Tsalapatis 25636929ebdSEmil Tsalapatis return 0; 25736929ebdSEmil Tsalapatis } 25836929ebdSEmil Tsalapatis 25936929ebdSEmil Tsalapatis /* 26036929ebdSEmil Tsalapatis * Free the allocated struct with the given index. Called with the 26136929ebdSEmil Tsalapatis * allocator lock taken. 26236929ebdSEmil Tsalapatis */ 26336929ebdSEmil Tsalapatis __hidden 26436929ebdSEmil Tsalapatis int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx) 26536929ebdSEmil Tsalapatis { 26636929ebdSEmil Tsalapatis const __u64 mask = (1 << SDT_TASK_ENTS_PER_PAGE_SHIFT) - 1; 26736929ebdSEmil Tsalapatis sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; 26836929ebdSEmil Tsalapatis sdt_desc_t * __arena *desc_children; 26936929ebdSEmil Tsalapatis struct sdt_chunk __arena *chunk; 27036929ebdSEmil Tsalapatis sdt_desc_t *desc; 27136929ebdSEmil Tsalapatis struct sdt_data __arena *data; 27236929ebdSEmil Tsalapatis __u64 level, shift, pos; 27336929ebdSEmil Tsalapatis __u64 lv_pos[SDT_TASK_LEVELS]; 27436929ebdSEmil Tsalapatis int ret; 27536929ebdSEmil Tsalapatis int i; 27636929ebdSEmil Tsalapatis 27736929ebdSEmil Tsalapatis if (!alloc) 27836929ebdSEmil Tsalapatis return 0; 27936929ebdSEmil Tsalapatis 28036929ebdSEmil Tsalapatis desc = alloc->root; 28136929ebdSEmil Tsalapatis if (unlikely(!desc)) 28236929ebdSEmil Tsalapatis return -EINVAL; 28336929ebdSEmil Tsalapatis 28436929ebdSEmil Tsalapatis /* To appease the verifier. */ 28536929ebdSEmil Tsalapatis for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 28636929ebdSEmil Tsalapatis lv_desc[level] = NULL; 28736929ebdSEmil Tsalapatis lv_pos[level] = 0; 28836929ebdSEmil Tsalapatis } 28936929ebdSEmil Tsalapatis 29036929ebdSEmil Tsalapatis /* Find the leaf node containing the index. */ 29136929ebdSEmil Tsalapatis for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 29236929ebdSEmil Tsalapatis shift = (SDT_TASK_LEVELS - 1 - level) * SDT_TASK_ENTS_PER_PAGE_SHIFT; 29336929ebdSEmil Tsalapatis pos = (idx >> shift) & mask; 29436929ebdSEmil Tsalapatis 29536929ebdSEmil Tsalapatis lv_desc[level] = desc; 29636929ebdSEmil Tsalapatis lv_pos[level] = pos; 29736929ebdSEmil Tsalapatis 29836929ebdSEmil Tsalapatis if (level == SDT_TASK_LEVELS - 1) 29936929ebdSEmil Tsalapatis break; 30036929ebdSEmil Tsalapatis 30136929ebdSEmil Tsalapatis chunk = desc->chunk; 30236929ebdSEmil Tsalapatis 30336929ebdSEmil Tsalapatis desc_children = (sdt_desc_t * __arena *)chunk->descs; 30436929ebdSEmil Tsalapatis desc = desc_children[pos]; 30536929ebdSEmil Tsalapatis 30636929ebdSEmil Tsalapatis if (unlikely(!desc)) 30736929ebdSEmil Tsalapatis return -EINVAL; 30836929ebdSEmil Tsalapatis } 30936929ebdSEmil Tsalapatis 31036929ebdSEmil Tsalapatis chunk = desc->chunk; 31136929ebdSEmil Tsalapatis 31236929ebdSEmil Tsalapatis pos = idx & mask; 31336929ebdSEmil Tsalapatis data = chunk->data[pos]; 31436929ebdSEmil Tsalapatis if (likely(data)) { 315*2e06d54eSEmil Tsalapatis *data = (struct sdt_data) { 31636929ebdSEmil Tsalapatis .tid.genn = data->tid.genn + 1, 31736929ebdSEmil Tsalapatis }; 31836929ebdSEmil Tsalapatis 31936929ebdSEmil Tsalapatis /* Zero out one word at a time. */ 32036929ebdSEmil Tsalapatis for (i = zero; i < alloc->pool.elem_size / 8 && can_loop; i++) { 32136929ebdSEmil Tsalapatis data->payload[i] = 0; 32236929ebdSEmil Tsalapatis } 32336929ebdSEmil Tsalapatis } 32436929ebdSEmil Tsalapatis 32536929ebdSEmil Tsalapatis ret = mark_nodes_avail(lv_desc, lv_pos); 32636929ebdSEmil Tsalapatis if (unlikely(ret != 0)) 32736929ebdSEmil Tsalapatis return ret; 32836929ebdSEmil Tsalapatis 32936929ebdSEmil Tsalapatis alloc_stats.active_allocs -= 1; 33036929ebdSEmil Tsalapatis alloc_stats.free_ops += 1; 33136929ebdSEmil Tsalapatis 33236929ebdSEmil Tsalapatis return 0; 33336929ebdSEmil Tsalapatis } 33436929ebdSEmil Tsalapatis 33536929ebdSEmil Tsalapatis static inline 33636929ebdSEmil Tsalapatis int ffs(__u64 word) 33736929ebdSEmil Tsalapatis { 33836929ebdSEmil Tsalapatis unsigned int num = 0; 33936929ebdSEmil Tsalapatis 34036929ebdSEmil Tsalapatis if ((word & 0xffffffff) == 0) { 34136929ebdSEmil Tsalapatis num += 32; 34236929ebdSEmil Tsalapatis word >>= 32; 34336929ebdSEmil Tsalapatis } 34436929ebdSEmil Tsalapatis 34536929ebdSEmil Tsalapatis if ((word & 0xffff) == 0) { 34636929ebdSEmil Tsalapatis num += 16; 34736929ebdSEmil Tsalapatis word >>= 16; 34836929ebdSEmil Tsalapatis } 34936929ebdSEmil Tsalapatis 35036929ebdSEmil Tsalapatis if ((word & 0xff) == 0) { 35136929ebdSEmil Tsalapatis num += 8; 35236929ebdSEmil Tsalapatis word >>= 8; 35336929ebdSEmil Tsalapatis } 35436929ebdSEmil Tsalapatis 35536929ebdSEmil Tsalapatis if ((word & 0xf) == 0) { 35636929ebdSEmil Tsalapatis num += 4; 35736929ebdSEmil Tsalapatis word >>= 4; 35836929ebdSEmil Tsalapatis } 35936929ebdSEmil Tsalapatis 36036929ebdSEmil Tsalapatis if ((word & 0x3) == 0) { 36136929ebdSEmil Tsalapatis num += 2; 36236929ebdSEmil Tsalapatis word >>= 2; 36336929ebdSEmil Tsalapatis } 36436929ebdSEmil Tsalapatis 36536929ebdSEmil Tsalapatis if ((word & 0x1) == 0) { 36636929ebdSEmil Tsalapatis num += 1; 36736929ebdSEmil Tsalapatis word >>= 1; 36836929ebdSEmil Tsalapatis } 36936929ebdSEmil Tsalapatis 37036929ebdSEmil Tsalapatis return num; 37136929ebdSEmil Tsalapatis } 37236929ebdSEmil Tsalapatis 37336929ebdSEmil Tsalapatis 37436929ebdSEmil Tsalapatis /* find the first empty slot */ 37536929ebdSEmil Tsalapatis __hidden 37636929ebdSEmil Tsalapatis __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc) 37736929ebdSEmil Tsalapatis { 37836929ebdSEmil Tsalapatis __u64 freeslots; 37936929ebdSEmil Tsalapatis __u64 i; 38036929ebdSEmil Tsalapatis 38136929ebdSEmil Tsalapatis for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) { 38236929ebdSEmil Tsalapatis freeslots = ~desc->allocated[i]; 38336929ebdSEmil Tsalapatis if (freeslots == (__u64)0) 38436929ebdSEmil Tsalapatis continue; 38536929ebdSEmil Tsalapatis 38636929ebdSEmil Tsalapatis return (i * 64) + ffs(freeslots); 38736929ebdSEmil Tsalapatis } 38836929ebdSEmil Tsalapatis 38936929ebdSEmil Tsalapatis return SDT_TASK_ENTS_PER_CHUNK; 39036929ebdSEmil Tsalapatis } 39136929ebdSEmil Tsalapatis 39236929ebdSEmil Tsalapatis /* 39336929ebdSEmil Tsalapatis * Find and return an available idx on the allocator. 39436929ebdSEmil Tsalapatis * Called with the task spinlock held. 39536929ebdSEmil Tsalapatis */ 39636929ebdSEmil Tsalapatis static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp) 39736929ebdSEmil Tsalapatis { 39836929ebdSEmil Tsalapatis sdt_desc_t *lv_desc[SDT_TASK_LEVELS]; 39936929ebdSEmil Tsalapatis sdt_desc_t * __arena *desc_children; 40036929ebdSEmil Tsalapatis struct sdt_chunk __arena *chunk; 40136929ebdSEmil Tsalapatis sdt_desc_t *tmp; 40236929ebdSEmil Tsalapatis __u64 lv_pos[SDT_TASK_LEVELS]; 40336929ebdSEmil Tsalapatis __u64 u, pos, level; 40436929ebdSEmil Tsalapatis __u64 idx = 0; 40536929ebdSEmil Tsalapatis int ret; 40636929ebdSEmil Tsalapatis 40736929ebdSEmil Tsalapatis for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { 40836929ebdSEmil Tsalapatis pos = chunk_find_empty(desc); 40936929ebdSEmil Tsalapatis 41036929ebdSEmil Tsalapatis /* If we error out, something has gone very wrong. */ 41136929ebdSEmil Tsalapatis if (unlikely(pos > SDT_TASK_ENTS_PER_CHUNK)) 41236929ebdSEmil Tsalapatis return NULL; 41336929ebdSEmil Tsalapatis 41436929ebdSEmil Tsalapatis if (pos == SDT_TASK_ENTS_PER_CHUNK) 41536929ebdSEmil Tsalapatis return NULL; 41636929ebdSEmil Tsalapatis 41736929ebdSEmil Tsalapatis idx <<= SDT_TASK_ENTS_PER_PAGE_SHIFT; 41836929ebdSEmil Tsalapatis idx |= pos; 41936929ebdSEmil Tsalapatis 42036929ebdSEmil Tsalapatis /* Log the levels to complete allocation. */ 42136929ebdSEmil Tsalapatis lv_desc[level] = desc; 42236929ebdSEmil Tsalapatis lv_pos[level] = pos; 42336929ebdSEmil Tsalapatis 42436929ebdSEmil Tsalapatis /* The rest of the loop is for internal node traversal. */ 42536929ebdSEmil Tsalapatis if (level == SDT_TASK_LEVELS - 1) 42636929ebdSEmil Tsalapatis break; 42736929ebdSEmil Tsalapatis 42836929ebdSEmil Tsalapatis /* Allocate an internal node if necessary. */ 42936929ebdSEmil Tsalapatis chunk = desc->chunk; 43036929ebdSEmil Tsalapatis desc_children = (sdt_desc_t * __arena *)chunk->descs; 43136929ebdSEmil Tsalapatis 43236929ebdSEmil Tsalapatis desc = desc_children[pos]; 43336929ebdSEmil Tsalapatis if (!desc) { 43436929ebdSEmil Tsalapatis desc = scx_alloc_chunk(); 43536929ebdSEmil Tsalapatis if (!desc) 43636929ebdSEmil Tsalapatis return NULL; 43736929ebdSEmil Tsalapatis 43836929ebdSEmil Tsalapatis desc_children[pos] = desc; 43936929ebdSEmil Tsalapatis } 44036929ebdSEmil Tsalapatis } 44136929ebdSEmil Tsalapatis 44236929ebdSEmil Tsalapatis /* 44336929ebdSEmil Tsalapatis * Finding the descriptor along with any internal node 44436929ebdSEmil Tsalapatis * allocations was successful. Update all levels with 44536929ebdSEmil Tsalapatis * the new allocation. 44636929ebdSEmil Tsalapatis */ 44736929ebdSEmil Tsalapatis bpf_for(u, 0, SDT_TASK_LEVELS) { 44836929ebdSEmil Tsalapatis level = SDT_TASK_LEVELS - 1 - u; 44936929ebdSEmil Tsalapatis tmp = lv_desc[level]; 45036929ebdSEmil Tsalapatis 45136929ebdSEmil Tsalapatis ret = set_idx_state(tmp, lv_pos[level], true); 45236929ebdSEmil Tsalapatis if (ret != 0) 45336929ebdSEmil Tsalapatis break; 45436929ebdSEmil Tsalapatis 45536929ebdSEmil Tsalapatis tmp->nr_free -= 1; 45636929ebdSEmil Tsalapatis if (tmp->nr_free > 0) 45736929ebdSEmil Tsalapatis break; 45836929ebdSEmil Tsalapatis } 45936929ebdSEmil Tsalapatis 46036929ebdSEmil Tsalapatis *idxp = idx; 46136929ebdSEmil Tsalapatis 46236929ebdSEmil Tsalapatis return desc; 46336929ebdSEmil Tsalapatis } 46436929ebdSEmil Tsalapatis 46536929ebdSEmil Tsalapatis __hidden 46636929ebdSEmil Tsalapatis void __arena *scx_alloc(struct scx_allocator *alloc) 46736929ebdSEmil Tsalapatis { 46836929ebdSEmil Tsalapatis struct sdt_data __arena *data = NULL; 46936929ebdSEmil Tsalapatis struct sdt_chunk __arena *chunk; 47036929ebdSEmil Tsalapatis sdt_desc_t *desc; 47136929ebdSEmil Tsalapatis __u64 idx, pos; 47236929ebdSEmil Tsalapatis 47336929ebdSEmil Tsalapatis if (!alloc) 47436929ebdSEmil Tsalapatis return NULL; 47536929ebdSEmil Tsalapatis 47636929ebdSEmil Tsalapatis bpf_spin_lock(&alloc_lock); 47736929ebdSEmil Tsalapatis 47836929ebdSEmil Tsalapatis /* We unlock if we encounter an error in the function. */ 47936929ebdSEmil Tsalapatis desc = desc_find_empty(alloc->root, &idx); 48036929ebdSEmil Tsalapatis if (unlikely(desc == NULL)) { 48136929ebdSEmil Tsalapatis bpf_spin_unlock(&alloc_lock); 48236929ebdSEmil Tsalapatis return NULL; 48336929ebdSEmil Tsalapatis } 48436929ebdSEmil Tsalapatis 48536929ebdSEmil Tsalapatis chunk = desc->chunk; 48636929ebdSEmil Tsalapatis 48736929ebdSEmil Tsalapatis /* Populate the leaf node if necessary. */ 48836929ebdSEmil Tsalapatis pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1); 48936929ebdSEmil Tsalapatis data = chunk->data[pos]; 49036929ebdSEmil Tsalapatis if (!data) { 49136929ebdSEmil Tsalapatis data = scx_alloc_from_pool(&alloc->pool); 49236929ebdSEmil Tsalapatis if (!data) { 49336929ebdSEmil Tsalapatis scx_alloc_free_idx(alloc, idx); 49436929ebdSEmil Tsalapatis bpf_spin_unlock(&alloc_lock); 49536929ebdSEmil Tsalapatis return NULL; 49636929ebdSEmil Tsalapatis } 49736929ebdSEmil Tsalapatis } 49836929ebdSEmil Tsalapatis 49936929ebdSEmil Tsalapatis chunk->data[pos] = data; 50036929ebdSEmil Tsalapatis 50136929ebdSEmil Tsalapatis /* The data counts as a chunk */ 50236929ebdSEmil Tsalapatis alloc_stats.data_allocs += 1; 50336929ebdSEmil Tsalapatis alloc_stats.alloc_ops += 1; 50436929ebdSEmil Tsalapatis alloc_stats.active_allocs += 1; 50536929ebdSEmil Tsalapatis 50636929ebdSEmil Tsalapatis data->tid.idx = idx; 50736929ebdSEmil Tsalapatis 50836929ebdSEmil Tsalapatis bpf_spin_unlock(&alloc_lock); 50936929ebdSEmil Tsalapatis 51036929ebdSEmil Tsalapatis return data; 51136929ebdSEmil Tsalapatis } 51236929ebdSEmil Tsalapatis 51336929ebdSEmil Tsalapatis /* 51436929ebdSEmil Tsalapatis * Task BPF map entry recording the task's assigned ID and pointing to the data 51536929ebdSEmil Tsalapatis * area allocated in arena. 51636929ebdSEmil Tsalapatis */ 51736929ebdSEmil Tsalapatis struct scx_task_map_val { 51836929ebdSEmil Tsalapatis union sdt_id tid; 51936929ebdSEmil Tsalapatis __u64 tptr; 52036929ebdSEmil Tsalapatis struct sdt_data __arena *data; 52136929ebdSEmil Tsalapatis }; 52236929ebdSEmil Tsalapatis 52336929ebdSEmil Tsalapatis struct { 52436929ebdSEmil Tsalapatis __uint(type, BPF_MAP_TYPE_TASK_STORAGE); 52536929ebdSEmil Tsalapatis __uint(map_flags, BPF_F_NO_PREALLOC); 52636929ebdSEmil Tsalapatis __type(key, int); 52736929ebdSEmil Tsalapatis __type(value, struct scx_task_map_val); 52836929ebdSEmil Tsalapatis } scx_task_map SEC(".maps"); 52936929ebdSEmil Tsalapatis 53036929ebdSEmil Tsalapatis static struct scx_allocator scx_task_allocator; 53136929ebdSEmil Tsalapatis 53236929ebdSEmil Tsalapatis __hidden 53336929ebdSEmil Tsalapatis void __arena *scx_task_alloc(struct task_struct *p) 53436929ebdSEmil Tsalapatis { 53536929ebdSEmil Tsalapatis struct sdt_data __arena *data = NULL; 53636929ebdSEmil Tsalapatis struct scx_task_map_val *mval; 53736929ebdSEmil Tsalapatis 53836929ebdSEmil Tsalapatis mval = bpf_task_storage_get(&scx_task_map, p, 0, 53936929ebdSEmil Tsalapatis BPF_LOCAL_STORAGE_GET_F_CREATE); 54036929ebdSEmil Tsalapatis if (!mval) 54136929ebdSEmil Tsalapatis return NULL; 54236929ebdSEmil Tsalapatis 54336929ebdSEmil Tsalapatis data = scx_alloc(&scx_task_allocator); 54436929ebdSEmil Tsalapatis if (unlikely(!data)) 54536929ebdSEmil Tsalapatis return NULL; 54636929ebdSEmil Tsalapatis 54736929ebdSEmil Tsalapatis mval->tid = data->tid; 54836929ebdSEmil Tsalapatis mval->tptr = (__u64) p; 54936929ebdSEmil Tsalapatis mval->data = data; 55036929ebdSEmil Tsalapatis 55136929ebdSEmil Tsalapatis return (void __arena *)data->payload; 55236929ebdSEmil Tsalapatis } 55336929ebdSEmil Tsalapatis 55436929ebdSEmil Tsalapatis __hidden 55536929ebdSEmil Tsalapatis int scx_task_init(__u64 data_size) 55636929ebdSEmil Tsalapatis { 55736929ebdSEmil Tsalapatis return scx_alloc_init(&scx_task_allocator, data_size); 55836929ebdSEmil Tsalapatis } 55936929ebdSEmil Tsalapatis 56036929ebdSEmil Tsalapatis __hidden 56136929ebdSEmil Tsalapatis void __arena *scx_task_data(struct task_struct *p) 56236929ebdSEmil Tsalapatis { 56336929ebdSEmil Tsalapatis struct sdt_data __arena *data; 56436929ebdSEmil Tsalapatis struct scx_task_map_val *mval; 56536929ebdSEmil Tsalapatis 56636929ebdSEmil Tsalapatis scx_arena_subprog_init(); 56736929ebdSEmil Tsalapatis 56836929ebdSEmil Tsalapatis mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); 56936929ebdSEmil Tsalapatis if (!mval) 57036929ebdSEmil Tsalapatis return NULL; 57136929ebdSEmil Tsalapatis 57236929ebdSEmil Tsalapatis data = mval->data; 57336929ebdSEmil Tsalapatis 57436929ebdSEmil Tsalapatis return (void __arena *)data->payload; 57536929ebdSEmil Tsalapatis } 57636929ebdSEmil Tsalapatis 57736929ebdSEmil Tsalapatis __hidden 57836929ebdSEmil Tsalapatis void scx_task_free(struct task_struct *p) 57936929ebdSEmil Tsalapatis { 58036929ebdSEmil Tsalapatis struct scx_task_map_val *mval; 58136929ebdSEmil Tsalapatis 58236929ebdSEmil Tsalapatis scx_arena_subprog_init(); 58336929ebdSEmil Tsalapatis 58436929ebdSEmil Tsalapatis mval = bpf_task_storage_get(&scx_task_map, p, 0, 0); 58536929ebdSEmil Tsalapatis if (!mval) 58636929ebdSEmil Tsalapatis return; 58736929ebdSEmil Tsalapatis 58836929ebdSEmil Tsalapatis bpf_spin_lock(&alloc_lock); 58936929ebdSEmil Tsalapatis scx_alloc_free_idx(&scx_task_allocator, mval->tid.idx); 59036929ebdSEmil Tsalapatis bpf_spin_unlock(&alloc_lock); 59136929ebdSEmil Tsalapatis 59236929ebdSEmil Tsalapatis bpf_task_storage_delete(&scx_task_map, p); 59336929ebdSEmil Tsalapatis } 59436929ebdSEmil Tsalapatis 59536929ebdSEmil Tsalapatis static inline void 59636929ebdSEmil Tsalapatis scx_stat_global_update(struct scx_stats __arena *stats) 59736929ebdSEmil Tsalapatis { 59836929ebdSEmil Tsalapatis cast_kern(stats); 59936929ebdSEmil Tsalapatis __sync_fetch_and_add(&stat_enqueue, stats->enqueue); 60036929ebdSEmil Tsalapatis __sync_fetch_and_add(&stat_init, stats->init); 60136929ebdSEmil Tsalapatis __sync_fetch_and_add(&stat_exit, stats->exit); 60236929ebdSEmil Tsalapatis __sync_fetch_and_add(&stat_select_idle_cpu, stats->select_idle_cpu); 60336929ebdSEmil Tsalapatis __sync_fetch_and_add(&stat_select_busy_cpu, stats->select_busy_cpu); 60436929ebdSEmil Tsalapatis } 60536929ebdSEmil Tsalapatis 60636929ebdSEmil Tsalapatis s32 BPF_STRUCT_OPS(sdt_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) 60736929ebdSEmil Tsalapatis { 60836929ebdSEmil Tsalapatis struct scx_stats __arena *stats; 60936929ebdSEmil Tsalapatis bool is_idle = false; 61036929ebdSEmil Tsalapatis s32 cpu; 61136929ebdSEmil Tsalapatis 61236929ebdSEmil Tsalapatis stats = scx_task_data(p); 61336929ebdSEmil Tsalapatis if (!stats) { 61436929ebdSEmil Tsalapatis scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 61536929ebdSEmil Tsalapatis return 0; 61636929ebdSEmil Tsalapatis } 61736929ebdSEmil Tsalapatis 61836929ebdSEmil Tsalapatis cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); 61936929ebdSEmil Tsalapatis if (is_idle) { 62036929ebdSEmil Tsalapatis stat_inc_select_idle_cpu(stats); 62136929ebdSEmil Tsalapatis scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); 62236929ebdSEmil Tsalapatis } else { 62336929ebdSEmil Tsalapatis stat_inc_select_busy_cpu(stats); 62436929ebdSEmil Tsalapatis } 62536929ebdSEmil Tsalapatis 62636929ebdSEmil Tsalapatis return cpu; 62736929ebdSEmil Tsalapatis } 62836929ebdSEmil Tsalapatis 62936929ebdSEmil Tsalapatis void BPF_STRUCT_OPS(sdt_enqueue, struct task_struct *p, u64 enq_flags) 63036929ebdSEmil Tsalapatis { 63136929ebdSEmil Tsalapatis struct scx_stats __arena *stats; 63236929ebdSEmil Tsalapatis 63336929ebdSEmil Tsalapatis stats = scx_task_data(p); 63436929ebdSEmil Tsalapatis if (!stats) { 63536929ebdSEmil Tsalapatis scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 63636929ebdSEmil Tsalapatis return; 63736929ebdSEmil Tsalapatis } 63836929ebdSEmil Tsalapatis 63936929ebdSEmil Tsalapatis stat_inc_enqueue(stats); 64036929ebdSEmil Tsalapatis 64136929ebdSEmil Tsalapatis scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); 64236929ebdSEmil Tsalapatis } 64336929ebdSEmil Tsalapatis 64436929ebdSEmil Tsalapatis void BPF_STRUCT_OPS(sdt_dispatch, s32 cpu, struct task_struct *prev) 64536929ebdSEmil Tsalapatis { 64636929ebdSEmil Tsalapatis scx_bpf_dsq_move_to_local(SHARED_DSQ); 64736929ebdSEmil Tsalapatis } 64836929ebdSEmil Tsalapatis 64936929ebdSEmil Tsalapatis s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init_task, struct task_struct *p, 65036929ebdSEmil Tsalapatis struct scx_init_task_args *args) 65136929ebdSEmil Tsalapatis { 65236929ebdSEmil Tsalapatis struct scx_stats __arena *stats; 65336929ebdSEmil Tsalapatis 65436929ebdSEmil Tsalapatis stats = scx_task_alloc(p); 65536929ebdSEmil Tsalapatis if (!stats) { 65636929ebdSEmil Tsalapatis scx_bpf_error("arena allocator out of memory"); 65736929ebdSEmil Tsalapatis return -ENOMEM; 65836929ebdSEmil Tsalapatis } 65936929ebdSEmil Tsalapatis 66036929ebdSEmil Tsalapatis stats->pid = p->pid; 66136929ebdSEmil Tsalapatis 66236929ebdSEmil Tsalapatis stat_inc_init(stats); 66336929ebdSEmil Tsalapatis 66436929ebdSEmil Tsalapatis return 0; 66536929ebdSEmil Tsalapatis } 66636929ebdSEmil Tsalapatis 66736929ebdSEmil Tsalapatis void BPF_STRUCT_OPS(sdt_exit_task, struct task_struct *p, 66836929ebdSEmil Tsalapatis struct scx_exit_task_args *args) 66936929ebdSEmil Tsalapatis { 67036929ebdSEmil Tsalapatis struct scx_stats __arena *stats; 67136929ebdSEmil Tsalapatis 67236929ebdSEmil Tsalapatis stats = scx_task_data(p); 67336929ebdSEmil Tsalapatis if (!stats) { 67436929ebdSEmil Tsalapatis scx_bpf_error("%s: no stats for pid %d", __func__, p->pid); 67536929ebdSEmil Tsalapatis return; 67636929ebdSEmil Tsalapatis } 67736929ebdSEmil Tsalapatis 67836929ebdSEmil Tsalapatis stat_inc_exit(stats); 67936929ebdSEmil Tsalapatis scx_stat_global_update(stats); 68036929ebdSEmil Tsalapatis 68136929ebdSEmil Tsalapatis scx_task_free(p); 68236929ebdSEmil Tsalapatis } 68336929ebdSEmil Tsalapatis 68436929ebdSEmil Tsalapatis s32 BPF_STRUCT_OPS_SLEEPABLE(sdt_init) 68536929ebdSEmil Tsalapatis { 68636929ebdSEmil Tsalapatis int ret; 68736929ebdSEmil Tsalapatis 68836929ebdSEmil Tsalapatis ret = scx_task_init(sizeof(struct scx_stats)); 68936929ebdSEmil Tsalapatis if (ret < 0) { 69036929ebdSEmil Tsalapatis scx_bpf_error("%s: failed with %d", __func__, ret); 69136929ebdSEmil Tsalapatis return ret; 69236929ebdSEmil Tsalapatis } 69336929ebdSEmil Tsalapatis 694bd4f0822Szhidao su ret = scx_bpf_create_dsq(SHARED_DSQ, -1); 695bd4f0822Szhidao su if (ret) { 696bd4f0822Szhidao su scx_bpf_error("failed to create DSQ %d (%d)", SHARED_DSQ, ret); 697bd4f0822Szhidao su return ret; 698bd4f0822Szhidao su } 699bd4f0822Szhidao su 700bd4f0822Szhidao su return 0; 70136929ebdSEmil Tsalapatis } 70236929ebdSEmil Tsalapatis 70336929ebdSEmil Tsalapatis void BPF_STRUCT_OPS(sdt_exit, struct scx_exit_info *ei) 70436929ebdSEmil Tsalapatis { 70536929ebdSEmil Tsalapatis UEI_RECORD(uei, ei); 70636929ebdSEmil Tsalapatis } 70736929ebdSEmil Tsalapatis 70836929ebdSEmil Tsalapatis SCX_OPS_DEFINE(sdt_ops, 70936929ebdSEmil Tsalapatis .select_cpu = (void *)sdt_select_cpu, 71036929ebdSEmil Tsalapatis .enqueue = (void *)sdt_enqueue, 71136929ebdSEmil Tsalapatis .dispatch = (void *)sdt_dispatch, 71236929ebdSEmil Tsalapatis .init_task = (void *)sdt_init_task, 71336929ebdSEmil Tsalapatis .exit_task = (void *)sdt_exit_task, 71436929ebdSEmil Tsalapatis .init = (void *)sdt_init, 71536929ebdSEmil Tsalapatis .exit = (void *)sdt_exit, 71636929ebdSEmil Tsalapatis .name = "sdt"); 717