1bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */ 2bba2c361STejun Heo /* 3bba2c361STejun Heo * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4bba2c361STejun Heo * 5bba2c361STejun Heo * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 6bba2c361STejun Heo * Copyright (c) 2026 Tejun Heo <tj@kernel.org> 7bba2c361STejun Heo */ 8bba2c361STejun Heo #include <linux/cacheinfo.h> 9bba2c361STejun Heo 10*3cd1f76bSTejun Heo #include "internal.h" 11*3cd1f76bSTejun Heo #include "cid.h" 12*3cd1f76bSTejun Heo 13bba2c361STejun Heo /* 14bba2c361STejun Heo * cid tables. 15bba2c361STejun Heo * 16bba2c361STejun Heo * Pointers are published once on first enable and never revoked. The default 17bba2c361STejun Heo * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits 18bba2c361STejun Heo * before it returns. As long as the BPF scheduler only uses the tables from 19bba2c361STejun Heo * those points onward, it sees a consistent view. 20bba2c361STejun Heo */ 21bba2c361STejun Heo s16 *scx_cid_to_cpu_tbl; 22bba2c361STejun Heo s16 *scx_cpu_to_cid_tbl; 23bba2c361STejun Heo struct scx_cid_topo *scx_cid_topo; 24bba2c361STejun Heo 25bba2c361STejun Heo #define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \ 26bba2c361STejun Heo .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \ 27bba2c361STejun Heo .node_cid = -1, .node_idx = -1, \ 28bba2c361STejun Heo } 29bba2c361STejun Heo 30bba2c361STejun Heo /* 31bba2c361STejun Heo * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or 32bba2c361STejun Heo * !present), record @cpu in @fallbacks and return its node mask instead - the 33bba2c361STejun Heo * worst that can happen is that the cpu's LLC becomes coarser than reality. 34bba2c361STejun Heo */ 35bba2c361STejun Heo static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks) 36bba2c361STejun Heo { 37bba2c361STejun Heo struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); 38bba2c361STejun Heo 39bba2c361STejun Heo if (!ci || !ci->info_list || !ci->num_leaves) { 40bba2c361STejun Heo cpumask_set_cpu(cpu, fallbacks); 41bba2c361STejun Heo return cpumask_of_node(cpu_to_node(cpu)); 42bba2c361STejun Heo } 43bba2c361STejun Heo return &ci->info_list[ci->num_leaves - 1].shared_cpu_map; 44bba2c361STejun Heo } 45bba2c361STejun Heo 46bba2c361STejun Heo /* Allocate the cid tables once on first enable; never freed. */ 47bba2c361STejun Heo static s32 scx_cid_arrays_alloc(void) 48bba2c361STejun Heo { 49bba2c361STejun Heo u32 npossible = num_possible_cpus(); 50bba2c361STejun Heo s16 *cid_to_cpu, *cpu_to_cid; 51bba2c361STejun Heo struct scx_cid_topo *cid_topo; 52bba2c361STejun Heo 53bba2c361STejun Heo if (scx_cid_to_cpu_tbl) 54bba2c361STejun Heo return 0; 55bba2c361STejun Heo 56bba2c361STejun Heo cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL); 57bba2c361STejun Heo cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL); 58bba2c361STejun Heo cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL); 59bba2c361STejun Heo 60bba2c361STejun Heo if (!cid_to_cpu || !cpu_to_cid || !cid_topo) { 61bba2c361STejun Heo kfree(cid_to_cpu); 62bba2c361STejun Heo kfree(cpu_to_cid); 63bba2c361STejun Heo kfree(cid_topo); 64bba2c361STejun Heo return -ENOMEM; 65bba2c361STejun Heo } 66bba2c361STejun Heo 67bba2c361STejun Heo WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu); 68bba2c361STejun Heo WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid); 69bba2c361STejun Heo WRITE_ONCE(scx_cid_topo, cid_topo); 70bba2c361STejun Heo return 0; 71bba2c361STejun Heo } 72bba2c361STejun Heo 73bba2c361STejun Heo /** 74bba2c361STejun Heo * scx_cid_init - build the cid mapping 75bba2c361STejun Heo * @sch: the scx_sched being initialized; used as the scx_error() target 76bba2c361STejun Heo * 77bba2c361STejun Heo * See "Topological CPU IDs" in cid.h for the model. Walk online cpus by 78bba2c361STejun Heo * intersection at each level (parent_scratch & this_level_mask), which keeps 79bba2c361STejun Heo * containment correct by construction and naturally splits a physical LLC 80bba2c361STejun Heo * straddling two NUMA nodes into two LLC units. The caller must hold 81bba2c361STejun Heo * cpus_read_lock. 82bba2c361STejun Heo */ 83bba2c361STejun Heo s32 scx_cid_init(struct scx_sched *sch) 84bba2c361STejun Heo { 85bba2c361STejun Heo cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL; 86bba2c361STejun Heo cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 87bba2c361STejun Heo cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 88bba2c361STejun Heo cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 89bba2c361STejun Heo cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL; 90bba2c361STejun Heo cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL; 91bba2c361STejun Heo u32 next_cid = 0; 92bba2c361STejun Heo s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0; 93bba2c361STejun Heo s32 cpu, ret; 94bba2c361STejun Heo 95bba2c361STejun Heo /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */ 96bba2c361STejun Heo BUILD_BUG_ON(NR_CPUS > 8192); 97bba2c361STejun Heo 98bba2c361STejun Heo lockdep_assert_cpus_held(); 99bba2c361STejun Heo 100bba2c361STejun Heo ret = scx_cid_arrays_alloc(); 101bba2c361STejun Heo if (ret) 102bba2c361STejun Heo return ret; 103bba2c361STejun Heo 104bba2c361STejun Heo if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) || 105bba2c361STejun Heo !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) || 106bba2c361STejun Heo !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) || 107bba2c361STejun Heo !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) || 108bba2c361STejun Heo !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) || 109bba2c361STejun Heo !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL)) 110bba2c361STejun Heo return -ENOMEM; 111bba2c361STejun Heo 112bba2c361STejun Heo /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */ 113bba2c361STejun Heo for (cpu = 0; cpu < nr_cpu_ids; cpu++) 114bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = -1; 115bba2c361STejun Heo 116bba2c361STejun Heo cpumask_copy(to_walk, cpu_online_mask); 117bba2c361STejun Heo 118bba2c361STejun Heo while (!cpumask_empty(to_walk)) { 119bba2c361STejun Heo s32 next_cpu = cpumask_first(to_walk); 120bba2c361STejun Heo s32 nid = cpu_to_node(next_cpu); 121bba2c361STejun Heo s32 node_cid = next_cid; 122bba2c361STejun Heo s32 node_idx; 123bba2c361STejun Heo 124bba2c361STejun Heo /* 125bba2c361STejun Heo * No NUMA info: skip and let the tail loop assign a no-topo 126bba2c361STejun Heo * cid. cpumask_of_node(-1) is undefined. 127bba2c361STejun Heo */ 128bba2c361STejun Heo if (nid < 0) { 129bba2c361STejun Heo cpumask_clear_cpu(next_cpu, to_walk); 130bba2c361STejun Heo continue; 131bba2c361STejun Heo } 132bba2c361STejun Heo 133bba2c361STejun Heo node_idx = next_node_idx++; 134bba2c361STejun Heo 135bba2c361STejun Heo /* node_scratch = to_walk & this node */ 136bba2c361STejun Heo cpumask_and(node_scratch, to_walk, cpumask_of_node(nid)); 137bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch))) 138bba2c361STejun Heo return -EINVAL; 139bba2c361STejun Heo 140bba2c361STejun Heo while (!cpumask_empty(node_scratch)) { 141bba2c361STejun Heo s32 ncpu = cpumask_first(node_scratch); 142bba2c361STejun Heo const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback); 143bba2c361STejun Heo s32 llc_cid = next_cid; 144bba2c361STejun Heo s32 llc_idx = next_llc_idx++; 145bba2c361STejun Heo 146bba2c361STejun Heo /* llc_scratch = node_scratch & this llc */ 147bba2c361STejun Heo cpumask_and(llc_scratch, node_scratch, llc_mask); 148bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch))) 149bba2c361STejun Heo return -EINVAL; 150bba2c361STejun Heo 151bba2c361STejun Heo while (!cpumask_empty(llc_scratch)) { 152bba2c361STejun Heo s32 lcpu = cpumask_first(llc_scratch); 153bba2c361STejun Heo const struct cpumask *sib = topology_sibling_cpumask(lcpu); 154bba2c361STejun Heo s32 core_cid = next_cid; 155bba2c361STejun Heo s32 core_idx = next_core_idx++; 156bba2c361STejun Heo s32 ccpu; 157bba2c361STejun Heo 158bba2c361STejun Heo /* core_scratch = llc_scratch & this core */ 159bba2c361STejun Heo cpumask_and(core_scratch, llc_scratch, sib); 160bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch))) 161bba2c361STejun Heo return -EINVAL; 162bba2c361STejun Heo 163bba2c361STejun Heo for_each_cpu(ccpu, core_scratch) { 164bba2c361STejun Heo s32 cid = next_cid++; 165bba2c361STejun Heo 166bba2c361STejun Heo scx_cid_to_cpu_tbl[cid] = ccpu; 167bba2c361STejun Heo scx_cpu_to_cid_tbl[ccpu] = cid; 168bba2c361STejun Heo scx_cid_topo[cid] = (struct scx_cid_topo){ 169bba2c361STejun Heo .core_cid = core_cid, 170bba2c361STejun Heo .core_idx = core_idx, 171bba2c361STejun Heo .llc_cid = llc_cid, 172bba2c361STejun Heo .llc_idx = llc_idx, 173bba2c361STejun Heo .node_cid = node_cid, 174bba2c361STejun Heo .node_idx = node_idx, 175bba2c361STejun Heo }; 176bba2c361STejun Heo 177bba2c361STejun Heo cpumask_clear_cpu(ccpu, llc_scratch); 178bba2c361STejun Heo cpumask_clear_cpu(ccpu, node_scratch); 179bba2c361STejun Heo cpumask_clear_cpu(ccpu, to_walk); 180bba2c361STejun Heo } 181bba2c361STejun Heo } 182bba2c361STejun Heo } 183bba2c361STejun Heo } 184bba2c361STejun Heo 185bba2c361STejun Heo /* 186bba2c361STejun Heo * No-topo section: any possible cpu without a cid - normally just the 187bba2c361STejun Heo * not-online ones. Collect any currently-online cpus that land here in 188bba2c361STejun Heo * @online_no_topo so we can warn about them at the end. 189bba2c361STejun Heo */ 190bba2c361STejun Heo for_each_cpu(cpu, cpu_possible_mask) { 191bba2c361STejun Heo s32 cid; 192bba2c361STejun Heo 193bba2c361STejun Heo if (__scx_cpu_to_cid(cpu) != -1) 194bba2c361STejun Heo continue; 195bba2c361STejun Heo if (cpu_online(cpu)) 196bba2c361STejun Heo cpumask_set_cpu(cpu, online_no_topo); 197bba2c361STejun Heo 198bba2c361STejun Heo cid = next_cid++; 199bba2c361STejun Heo scx_cid_to_cpu_tbl[cid] = cpu; 200bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = cid; 201bba2c361STejun Heo scx_cid_topo[cid] = SCX_CID_TOPO_NEG; 202bba2c361STejun Heo } 203bba2c361STejun Heo 204bba2c361STejun Heo if (!cpumask_empty(llc_fallback)) 205bba2c361STejun Heo pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n", 206bba2c361STejun Heo cpumask_pr_args(llc_fallback)); 207bba2c361STejun Heo if (!cpumask_empty(online_no_topo)) 208bba2c361STejun Heo pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n", 209bba2c361STejun Heo cpumask_pr_args(online_no_topo)); 210bba2c361STejun Heo 211bba2c361STejun Heo return 0; 212bba2c361STejun Heo } 213bba2c361STejun Heo 214bba2c361STejun Heo /** 215bba2c361STejun Heo * scx_cmask_clear - Zero every bit in @m's active range 216bba2c361STejun Heo * @m: cmask to clear 217bba2c361STejun Heo * 218bba2c361STejun Heo * Storage past the active range is left as is. 219bba2c361STejun Heo */ 220bba2c361STejun Heo void scx_cmask_clear(struct scx_cmask *m) 221bba2c361STejun Heo { 222bba2c361STejun Heo u32 nr_words; 223bba2c361STejun Heo 224bba2c361STejun Heo if (!m->nr_cids) 225bba2c361STejun Heo return; 226bba2c361STejun Heo nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; 227bba2c361STejun Heo memset(m->bits, 0, nr_words * sizeof(u64)); 228bba2c361STejun Heo } 229bba2c361STejun Heo 230bba2c361STejun Heo /** 231bba2c361STejun Heo * scx_cmask_fill - Set every bit in @m's active range 232bba2c361STejun Heo * @m: cmask to fill 233bba2c361STejun Heo * 234bba2c361STejun Heo * Counterpart to scx_cmask_clear(). Storage past the active range is left as is. 235bba2c361STejun Heo */ 236bba2c361STejun Heo void scx_cmask_fill(struct scx_cmask *m) 237bba2c361STejun Heo { 238bba2c361STejun Heo u32 nr_words, head_bits, tail_bits; 239bba2c361STejun Heo 240bba2c361STejun Heo if (!m->nr_cids) 241bba2c361STejun Heo return; 242bba2c361STejun Heo nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; 243bba2c361STejun Heo memset(m->bits, 0xff, nr_words * sizeof(u64)); 244bba2c361STejun Heo 245bba2c361STejun Heo /* clear word-0 bits below base */ 246bba2c361STejun Heo head_bits = m->base & 63; 247bba2c361STejun Heo if (head_bits) 248bba2c361STejun Heo m->bits[0] &= ~((1ULL << head_bits) - 1); 249bba2c361STejun Heo 250bba2c361STejun Heo /* clear last-word bits at or past base + nr_cids */ 251bba2c361STejun Heo tail_bits = (m->base + m->nr_cids) & 63; 252bba2c361STejun Heo if (tail_bits) 253bba2c361STejun Heo m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1; 254bba2c361STejun Heo } 255bba2c361STejun Heo 256bba2c361STejun Heo /** 257bba2c361STejun Heo * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask 258bba2c361STejun Heo * @src: source cpumask 259bba2c361STejun Heo * @dst: cmask to write 260bba2c361STejun Heo * 261bba2c361STejun Heo * Clear @dst's active range and set the bit for each cid whose cpu is in 262bba2c361STejun Heo * @src and lies within that range. Out-of-range cids are silently ignored. 263bba2c361STejun Heo */ 264bba2c361STejun Heo void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst) 265bba2c361STejun Heo { 266bba2c361STejun Heo s32 cpu; 267bba2c361STejun Heo 268bba2c361STejun Heo scx_cmask_clear(dst); 269bba2c361STejun Heo for_each_cpu(cpu, src) { 270bba2c361STejun Heo s32 cid = __scx_cpu_to_cid(cpu); 271bba2c361STejun Heo 272bba2c361STejun Heo if (cid >= 0) 273bba2c361STejun Heo __scx_cmask_set(cid, dst); 274bba2c361STejun Heo } 275bba2c361STejun Heo } 276bba2c361STejun Heo 277bba2c361STejun Heo __bpf_kfunc_start_defs(); 278bba2c361STejun Heo 279bba2c361STejun Heo /** 280bba2c361STejun Heo * scx_bpf_cid_override - Install an explicit cpu->cid mapping 281bba2c361STejun Heo * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu) 282bba2c361STejun Heo * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes 283bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 284bba2c361STejun Heo * 285bba2c361STejun Heo * May only be called from ops.init() of the root scheduler. Replace the 286bba2c361STejun Heo * topology-probed cid mapping with the caller-provided one. Each possible cpu 287bba2c361STejun Heo * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared. 288bba2c361STejun Heo * On invalid input, trigger scx_error() to abort the scheduler. 289bba2c361STejun Heo */ 290bba2c361STejun Heo __bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz, 291bba2c361STejun Heo const struct bpf_prog_aux *aux) 292bba2c361STejun Heo { 293bba2c361STejun Heo cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL; 294bba2c361STejun Heo struct scx_sched *sch; 295bba2c361STejun Heo bool alloced; 296bba2c361STejun Heo s32 cpu, cid; 297bba2c361STejun Heo 298bba2c361STejun Heo /* GFP_KERNEL alloc must happen before the rcu read section */ 299bba2c361STejun Heo alloced = zalloc_cpumask_var(&seen, GFP_KERNEL); 300bba2c361STejun Heo 301bba2c361STejun Heo guard(rcu)(); 302bba2c361STejun Heo 303bba2c361STejun Heo sch = scx_prog_sched(aux); 304bba2c361STejun Heo if (unlikely(!sch)) 305bba2c361STejun Heo return; 306bba2c361STejun Heo 307bba2c361STejun Heo if (!alloced) { 308bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask"); 309bba2c361STejun Heo return; 310bba2c361STejun Heo } 311bba2c361STejun Heo 312bba2c361STejun Heo if (scx_parent(sch)) { 313bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override() only allowed from root sched"); 314bba2c361STejun Heo return; 315bba2c361STejun Heo } 316bba2c361STejun Heo 317bba2c361STejun Heo if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) { 318bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u", 319bba2c361STejun Heo nr_cpu_ids * sizeof(s32), cpu_to_cid__sz); 320bba2c361STejun Heo return; 321bba2c361STejun Heo } 322bba2c361STejun Heo 323bba2c361STejun Heo for_each_possible_cpu(cpu) { 324bba2c361STejun Heo s32 c = cpu_to_cid[cpu]; 325bba2c361STejun Heo 326bba2c361STejun Heo if (!cid_valid(sch, c)) 327bba2c361STejun Heo return; 328bba2c361STejun Heo if (cpumask_test_and_set_cpu(c, seen)) { 329bba2c361STejun Heo scx_error(sch, "cid %d assigned to multiple cpus", c); 330bba2c361STejun Heo return; 331bba2c361STejun Heo } 332bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = c; 333bba2c361STejun Heo scx_cid_to_cpu_tbl[c] = cpu; 334bba2c361STejun Heo } 335bba2c361STejun Heo 336bba2c361STejun Heo /* Invalidate stale topo info - the override carries no topology. */ 337bba2c361STejun Heo for (cid = 0; cid < num_possible_cpus(); cid++) 338bba2c361STejun Heo scx_cid_topo[cid] = SCX_CID_TOPO_NEG; 339bba2c361STejun Heo } 340bba2c361STejun Heo 341bba2c361STejun Heo /** 342bba2c361STejun Heo * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid 343bba2c361STejun Heo * @cid: cid to look up 344bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 345bba2c361STejun Heo * 346bba2c361STejun Heo * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if 347bba2c361STejun Heo * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the 348bba2c361STejun Heo * loaded scheduler, so the BPF side can cache the result to avoid repeated 349bba2c361STejun Heo * kfunc invocations. 350bba2c361STejun Heo */ 351bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux) 352bba2c361STejun Heo { 353bba2c361STejun Heo struct scx_sched *sch; 354bba2c361STejun Heo 355bba2c361STejun Heo guard(rcu)(); 356bba2c361STejun Heo 357bba2c361STejun Heo sch = scx_prog_sched(aux); 358bba2c361STejun Heo if (unlikely(!sch)) 359bba2c361STejun Heo return -EINVAL; 360bba2c361STejun Heo return scx_cid_to_cpu(sch, cid); 361bba2c361STejun Heo } 362bba2c361STejun Heo 363bba2c361STejun Heo /** 364bba2c361STejun Heo * scx_bpf_cpu_to_cid - Return the cid for @cpu 365bba2c361STejun Heo * @cpu: cpu to look up 366bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 367bba2c361STejun Heo * 368bba2c361STejun Heo * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is 369bba2c361STejun Heo * invalid. The cid<->cpu mapping is static for the lifetime of the loaded 370bba2c361STejun Heo * scheduler, so the BPF side can cache the result to avoid repeated kfunc 371bba2c361STejun Heo * invocations. 372bba2c361STejun Heo */ 373bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux) 374bba2c361STejun Heo { 375bba2c361STejun Heo struct scx_sched *sch; 376bba2c361STejun Heo 377bba2c361STejun Heo guard(rcu)(); 378bba2c361STejun Heo 379bba2c361STejun Heo sch = scx_prog_sched(aux); 380bba2c361STejun Heo if (unlikely(!sch)) 381bba2c361STejun Heo return -EINVAL; 382bba2c361STejun Heo return scx_cpu_to_cid(sch, cpu); 383bba2c361STejun Heo } 384bba2c361STejun Heo 385bba2c361STejun Heo /* 386bba2c361STejun Heo * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating 387bba2c361STejun Heo * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms; 388bba2c361STejun Heo * cmask_walk_op1() does the same shape over a single cmask range. Every public 389bba2c361STejun Heo * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and 390bba2c361STejun Heo * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the 391bba2c361STejun Heo * selected op and cmask_op2_is_pred() folds the predicate early-exit out of 392bba2c361STejun Heo * mutating ops. 393bba2c361STejun Heo * 394bba2c361STejun Heo * Two-cmask ops only touch @dst bits inside the intersection of the two ranges; 395bba2c361STejun Heo * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero 396bba2c361STejun Heo * @dst bits that lie outside @src's range. 397bba2c361STejun Heo * 398bba2c361STejun Heo * The _RACY variants are otherwise identical to their non-racy counterpart but 399bba2c361STejun Heo * read @src word-by-word via data_race(). Memory ordering with concurrent 400bba2c361STejun Heo * writers is the caller's responsibility. 401bba2c361STejun Heo */ 402bba2c361STejun Heo enum cmask_op2 { 403bba2c361STejun Heo /* mutating */ 404bba2c361STejun Heo CMASK_OP2_AND, 405bba2c361STejun Heo CMASK_OP2_OR, 406bba2c361STejun Heo CMASK_OP2_OR_RACY, 407bba2c361STejun Heo CMASK_OP2_COPY, 408bba2c361STejun Heo CMASK_OP2_COPY_RACY, 409bba2c361STejun Heo CMASK_OP2_ANDNOT, 410bba2c361STejun Heo /* predicates - short-circuit when the per-word result is true */ 411bba2c361STejun Heo CMASK_OP2_SUBSET, 412bba2c361STejun Heo CMASK_OP2_INTERSECTS, 413bba2c361STejun Heo }; 414bba2c361STejun Heo 415bba2c361STejun Heo static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op) 416bba2c361STejun Heo { 417bba2c361STejun Heo return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS; 418bba2c361STejun Heo } 419bba2c361STejun Heo 420bba2c361STejun Heo static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask, 421bba2c361STejun Heo const enum cmask_op2 op) 422bba2c361STejun Heo { 423bba2c361STejun Heo switch (op) { 424bba2c361STejun Heo case CMASK_OP2_AND: 425bba2c361STejun Heo *av &= ~mask | *bp; 426bba2c361STejun Heo return false; 427bba2c361STejun Heo case CMASK_OP2_OR: 428bba2c361STejun Heo *av |= *bp & mask; 429bba2c361STejun Heo return false; 430bba2c361STejun Heo case CMASK_OP2_OR_RACY: 431bba2c361STejun Heo *av |= data_race(*bp) & mask; 432bba2c361STejun Heo return false; 433bba2c361STejun Heo case CMASK_OP2_COPY: 434bba2c361STejun Heo *av = (*av & ~mask) | (*bp & mask); 435bba2c361STejun Heo return false; 436bba2c361STejun Heo case CMASK_OP2_COPY_RACY: 437bba2c361STejun Heo *av = (*av & ~mask) | (data_race(*bp) & mask); 438bba2c361STejun Heo return false; 439bba2c361STejun Heo case CMASK_OP2_ANDNOT: 440bba2c361STejun Heo *av &= ~(*bp & mask); 441bba2c361STejun Heo return false; 442bba2c361STejun Heo case CMASK_OP2_SUBSET: 443bba2c361STejun Heo /* stop on the first bit in @sub not set in @super */ 444bba2c361STejun Heo return (*bp & ~*av) & mask; 445bba2c361STejun Heo case CMASK_OP2_INTERSECTS: 446bba2c361STejun Heo return (*av & *bp) & mask; 447bba2c361STejun Heo } 448bba2c361STejun Heo unreachable(); 449bba2c361STejun Heo } 450bba2c361STejun Heo 451bba2c361STejun Heo /* 452bba2c361STejun Heo * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base, 453bba2c361STejun Heo * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words 454bba2c361STejun Heo * and return false; predicates return true on the first word whose per-word 455bba2c361STejun Heo * test is true. Empty intersection returns false (matches "no bits to consider" 456bba2c361STejun Heo * for both mutate and predicate). 457bba2c361STejun Heo * 458bba2c361STejun Heo * Base/nr_cids are taken as parameters so callers with snapshotted bounds can 459bba2c361STejun Heo * drive the walk with values independent of the cmask's header. 460bba2c361STejun Heo */ 461bba2c361STejun Heo static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids, 462bba2c361STejun Heo const u64 *b_bits, u32 b_base, u32 b_nr_cids, 463bba2c361STejun Heo const enum cmask_op2 op) 464bba2c361STejun Heo { 465bba2c361STejun Heo u32 lo = max(a_base, b_base); 466bba2c361STejun Heo u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids); 467bba2c361STejun Heo u32 a_word_off = a_base / 64; 468bba2c361STejun Heo u32 b_word_off = b_base / 64; 469bba2c361STejun Heo u32 lo_word = lo / 64; 470bba2c361STejun Heo u32 hi_word = (hi - 1) / 64; 471bba2c361STejun Heo u64 head_mask = GENMASK_U64(63, lo & 63); 472bba2c361STejun Heo u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); 473bba2c361STejun Heo u32 w; 474bba2c361STejun Heo 475bba2c361STejun Heo if (lo >= hi) 476bba2c361STejun Heo return false; 477bba2c361STejun Heo 478bba2c361STejun Heo if (lo_word == hi_word) 479bba2c361STejun Heo return cmask_word_op2(&a_bits[lo_word - a_word_off], 480bba2c361STejun Heo &b_bits[lo_word - b_word_off], 481bba2c361STejun Heo head_mask & tail_mask, op); 482bba2c361STejun Heo 483bba2c361STejun Heo if (cmask_word_op2(&a_bits[lo_word - a_word_off], 484bba2c361STejun Heo &b_bits[lo_word - b_word_off], head_mask, op) && 485bba2c361STejun Heo cmask_op2_is_pred(op)) 486bba2c361STejun Heo return true; 487bba2c361STejun Heo 488bba2c361STejun Heo for (w = lo_word + 1; w < hi_word; w++) 489bba2c361STejun Heo if (cmask_word_op2(&a_bits[w - a_word_off], 490bba2c361STejun Heo &b_bits[w - b_word_off], ~0ULL, op) && 491bba2c361STejun Heo cmask_op2_is_pred(op)) 492bba2c361STejun Heo return true; 493bba2c361STejun Heo 494bba2c361STejun Heo return cmask_word_op2(&a_bits[hi_word - a_word_off], 495bba2c361STejun Heo &b_bits[hi_word - b_word_off], tail_mask, op); 496bba2c361STejun Heo } 497bba2c361STejun Heo 498bba2c361STejun Heo enum cmask_op1 { 499bba2c361STejun Heo CMASK_OP1_ANY_SET, 500bba2c361STejun Heo }; 501bba2c361STejun Heo 502bba2c361STejun Heo static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask, 503bba2c361STejun Heo const enum cmask_op1 op) 504bba2c361STejun Heo { 505bba2c361STejun Heo switch (op) { 506bba2c361STejun Heo case CMASK_OP1_ANY_SET: 507bba2c361STejun Heo return *ap & mask; 508bba2c361STejun Heo } 509bba2c361STejun Heo unreachable(); 510bba2c361STejun Heo } 511bba2c361STejun Heo 512bba2c361STejun Heo /* 513bba2c361STejun Heo * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op. 514bba2c361STejun Heo * Returns true on the first word whose per-word test is true; returns false if 515bba2c361STejun Heo * no word matches or the range is empty. All current op1s short-circuit on 516bba2c361STejun Heo * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred() 517bba2c361STejun Heo * guard analogous to cmask_op2_is_pred(). 518bba2c361STejun Heo */ 519bba2c361STejun Heo static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base, 520bba2c361STejun Heo u32 a_nr_cids, 521bba2c361STejun Heo const enum cmask_op1 op) 522bba2c361STejun Heo { 523bba2c361STejun Heo u32 lo = a_base; 524bba2c361STejun Heo u32 hi = a_base + a_nr_cids; 525bba2c361STejun Heo u32 a_word_off = a_base / 64; 526bba2c361STejun Heo u32 lo_word = lo / 64; 527bba2c361STejun Heo u32 hi_word = (hi - 1) / 64; 528bba2c361STejun Heo u64 head_mask = GENMASK_U64(63, lo & 63); 529bba2c361STejun Heo u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); 530bba2c361STejun Heo u32 w; 531bba2c361STejun Heo 532bba2c361STejun Heo if (lo >= hi) 533bba2c361STejun Heo return false; 534bba2c361STejun Heo 535bba2c361STejun Heo if (lo_word == hi_word) 536bba2c361STejun Heo return cmask_word_op1(&a_bits[lo_word - a_word_off], 537bba2c361STejun Heo head_mask & tail_mask, op); 538bba2c361STejun Heo 539bba2c361STejun Heo if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op)) 540bba2c361STejun Heo return true; 541bba2c361STejun Heo for (w = lo_word + 1; w < hi_word; w++) 542bba2c361STejun Heo if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op)) 543bba2c361STejun Heo return true; 544bba2c361STejun Heo return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op); 545bba2c361STejun Heo } 546bba2c361STejun Heo 547bba2c361STejun Heo void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src) 548bba2c361STejun Heo { 549bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 550bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_AND); 551bba2c361STejun Heo } 552bba2c361STejun Heo 553bba2c361STejun Heo void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src) 554bba2c361STejun Heo { 555bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 556bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_OR); 557bba2c361STejun Heo } 558bba2c361STejun Heo 559bba2c361STejun Heo /** 560bba2c361STejun Heo * scx_cmask_or_racy - OR @src into @dst, reading @src without locking 561bba2c361STejun Heo * 562bba2c361STejun Heo * @src is read word-by-word through data_race(). Same per-bit independence 563bba2c361STejun Heo * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the 564bba2c361STejun Heo * caller's responsibility. 565bba2c361STejun Heo */ 566bba2c361STejun Heo void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src) 567bba2c361STejun Heo { 568bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 569bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY); 570bba2c361STejun Heo } 571bba2c361STejun Heo 572bba2c361STejun Heo void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src) 573bba2c361STejun Heo { 574bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 575bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_COPY); 576bba2c361STejun Heo } 577bba2c361STejun Heo 578bba2c361STejun Heo /** 579bba2c361STejun Heo * scx_cmask_copy_racy - Snapshot @src into @dst without locking 580bba2c361STejun Heo * 581bba2c361STejun Heo * @src is read word-by-word through data_race(). Head/tail masking matches 582bba2c361STejun Heo * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates 583bba2c361STejun Heo * just leave some bits fresher than others. Memory ordering with writers is 584bba2c361STejun Heo * the caller's responsibility. 585bba2c361STejun Heo */ 586bba2c361STejun Heo void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src) 587bba2c361STejun Heo { 588bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 589bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY); 590bba2c361STejun Heo } 591bba2c361STejun Heo 592bba2c361STejun Heo void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src) 593bba2c361STejun Heo { 594bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 595bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT); 596bba2c361STejun Heo } 597bba2c361STejun Heo 598bba2c361STejun Heo /* 599bba2c361STejun Heo * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure 600bba2c361STejun Heo * [@lo, @hi) is contained in @cm's range. 601bba2c361STejun Heo */ 602bba2c361STejun Heo static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi) 603bba2c361STejun Heo { 604bba2c361STejun Heo if (lo >= hi) 605bba2c361STejun Heo return false; 606bba2c361STejun Heo return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo, 607bba2c361STejun Heo CMASK_OP1_ANY_SET); 608bba2c361STejun Heo } 609bba2c361STejun Heo 610bba2c361STejun Heo /** 611bba2c361STejun Heo * scx_cmask_subset - test whether @sub is a subset of @super 612bba2c361STejun Heo * @sub: cmask to test 613bba2c361STejun Heo * @super: cmask to test against 614bba2c361STejun Heo * 615bba2c361STejun Heo * Return true iff every set bit of @sub is also set in @super. 616bba2c361STejun Heo */ 617bba2c361STejun Heo bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super) 618bba2c361STejun Heo { 619bba2c361STejun Heo u32 super_end = super->base + super->nr_cids; 620bba2c361STejun Heo u32 sub_end = sub->base + sub->nr_cids; 621bba2c361STejun Heo 622bba2c361STejun Heo /* 623bba2c361STejun Heo * Set bits in @sub outside @super's range can't be in @super, so any 624bba2c361STejun Heo * such bit means not a subset. The walk below only visits words 625bba2c361STejun Heo * common to both ranges, so these need a separate scan. 626bba2c361STejun Heo */ 627bba2c361STejun Heo if (sub->base < super->base && 628bba2c361STejun Heo cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end))) 629bba2c361STejun Heo return false; 630bba2c361STejun Heo if (sub_end > super_end && 631bba2c361STejun Heo cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end)) 632bba2c361STejun Heo return false; 633bba2c361STejun Heo 634bba2c361STejun Heo return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids, 635bba2c361STejun Heo sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET); 636bba2c361STejun Heo } 637bba2c361STejun Heo 638bba2c361STejun Heo bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b) 639bba2c361STejun Heo { 640bba2c361STejun Heo return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids, 641bba2c361STejun Heo b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS); 642bba2c361STejun Heo } 643bba2c361STejun Heo 644bba2c361STejun Heo /** 645bba2c361STejun Heo * scx_cmask_empty - Test whether @m has no bits set 646bba2c361STejun Heo * @m: cmask to test 647bba2c361STejun Heo * 648bba2c361STejun Heo * Return true iff @m's active range has no bits set. 649bba2c361STejun Heo */ 650bba2c361STejun Heo bool scx_cmask_empty(const struct scx_cmask *m) 651bba2c361STejun Heo { 652bba2c361STejun Heo return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids); 653bba2c361STejun Heo } 654bba2c361STejun Heo 655bba2c361STejun Heo /** 656bba2c361STejun Heo * scx_bpf_cid_topo - Copy out per-cid topology info 657bba2c361STejun Heo * @cid: cid to look up 658bba2c361STejun Heo * @out__uninit: where to copy the topology info; fully written by this call 659bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 660bba2c361STejun Heo * 661bba2c361STejun Heo * Fill @out__uninit with the topology info for @cid. Trigger scx_error() if 662bba2c361STejun Heo * @cid is out of range. If @cid is valid but in the no-topo section, all fields 663bba2c361STejun Heo * are set to -1. 664bba2c361STejun Heo */ 665bba2c361STejun Heo __bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit, 666bba2c361STejun Heo const struct bpf_prog_aux *aux) 667bba2c361STejun Heo { 668bba2c361STejun Heo struct scx_sched *sch; 669bba2c361STejun Heo 670bba2c361STejun Heo guard(rcu)(); 671bba2c361STejun Heo 672bba2c361STejun Heo sch = scx_prog_sched(aux); 673bba2c361STejun Heo if (unlikely(!sch) || !cid_valid(sch, cid)) { 674bba2c361STejun Heo *out__uninit = SCX_CID_TOPO_NEG; 675bba2c361STejun Heo return; 676bba2c361STejun Heo } 677bba2c361STejun Heo 678bba2c361STejun Heo *out__uninit = READ_ONCE(scx_cid_topo)[cid]; 679bba2c361STejun Heo } 680bba2c361STejun Heo 681bba2c361STejun Heo __bpf_kfunc_end_defs(); 682bba2c361STejun Heo 683bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_init) 684bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 685bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_init) 686bba2c361STejun Heo 687bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_init = { 688bba2c361STejun Heo .owner = THIS_MODULE, 689bba2c361STejun Heo .set = &scx_kfunc_ids_init, 690bba2c361STejun Heo .filter = scx_kfunc_context_filter, 691bba2c361STejun Heo }; 692bba2c361STejun Heo 693bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_cid) 694bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS) 695bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS) 696bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS) 697bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_cid) 698bba2c361STejun Heo 699bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_cid = { 700bba2c361STejun Heo .owner = THIS_MODULE, 701bba2c361STejun Heo .set = &scx_kfunc_ids_cid, 702bba2c361STejun Heo }; 703bba2c361STejun Heo 704bba2c361STejun Heo int scx_cid_kfunc_init(void) 705bba2c361STejun Heo { 706bba2c361STejun Heo return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init) ?: 707bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid) ?: 708bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid) ?: 709bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid); 710bba2c361STejun Heo } 711