1*bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */ 2*bba2c361STejun Heo /* 3*bba2c361STejun Heo * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst 4*bba2c361STejun Heo * 5*bba2c361STejun Heo * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 6*bba2c361STejun Heo * Copyright (c) 2026 Tejun Heo <tj@kernel.org> 7*bba2c361STejun Heo */ 8*bba2c361STejun Heo #include <linux/cacheinfo.h> 9*bba2c361STejun Heo 10*bba2c361STejun Heo /* 11*bba2c361STejun Heo * cid tables. 12*bba2c361STejun Heo * 13*bba2c361STejun Heo * Pointers are published once on first enable and never revoked. The default 14*bba2c361STejun Heo * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits 15*bba2c361STejun Heo * before it returns. As long as the BPF scheduler only uses the tables from 16*bba2c361STejun Heo * those points onward, it sees a consistent view. 17*bba2c361STejun Heo */ 18*bba2c361STejun Heo s16 *scx_cid_to_cpu_tbl; 19*bba2c361STejun Heo s16 *scx_cpu_to_cid_tbl; 20*bba2c361STejun Heo struct scx_cid_topo *scx_cid_topo; 21*bba2c361STejun Heo 22*bba2c361STejun Heo #define SCX_CID_TOPO_NEG (struct scx_cid_topo) { \ 23*bba2c361STejun Heo .core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1, \ 24*bba2c361STejun Heo .node_cid = -1, .node_idx = -1, \ 25*bba2c361STejun Heo } 26*bba2c361STejun Heo 27*bba2c361STejun Heo /* 28*bba2c361STejun Heo * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or 29*bba2c361STejun Heo * !present), record @cpu in @fallbacks and return its node mask instead - the 30*bba2c361STejun Heo * worst that can happen is that the cpu's LLC becomes coarser than reality. 31*bba2c361STejun Heo */ 32*bba2c361STejun Heo static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks) 33*bba2c361STejun Heo { 34*bba2c361STejun Heo struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); 35*bba2c361STejun Heo 36*bba2c361STejun Heo if (!ci || !ci->info_list || !ci->num_leaves) { 37*bba2c361STejun Heo cpumask_set_cpu(cpu, fallbacks); 38*bba2c361STejun Heo return cpumask_of_node(cpu_to_node(cpu)); 39*bba2c361STejun Heo } 40*bba2c361STejun Heo return &ci->info_list[ci->num_leaves - 1].shared_cpu_map; 41*bba2c361STejun Heo } 42*bba2c361STejun Heo 43*bba2c361STejun Heo /* Allocate the cid tables once on first enable; never freed. */ 44*bba2c361STejun Heo static s32 scx_cid_arrays_alloc(void) 45*bba2c361STejun Heo { 46*bba2c361STejun Heo u32 npossible = num_possible_cpus(); 47*bba2c361STejun Heo s16 *cid_to_cpu, *cpu_to_cid; 48*bba2c361STejun Heo struct scx_cid_topo *cid_topo; 49*bba2c361STejun Heo 50*bba2c361STejun Heo if (scx_cid_to_cpu_tbl) 51*bba2c361STejun Heo return 0; 52*bba2c361STejun Heo 53*bba2c361STejun Heo cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL); 54*bba2c361STejun Heo cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL); 55*bba2c361STejun Heo cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL); 56*bba2c361STejun Heo 57*bba2c361STejun Heo if (!cid_to_cpu || !cpu_to_cid || !cid_topo) { 58*bba2c361STejun Heo kfree(cid_to_cpu); 59*bba2c361STejun Heo kfree(cpu_to_cid); 60*bba2c361STejun Heo kfree(cid_topo); 61*bba2c361STejun Heo return -ENOMEM; 62*bba2c361STejun Heo } 63*bba2c361STejun Heo 64*bba2c361STejun Heo WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu); 65*bba2c361STejun Heo WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid); 66*bba2c361STejun Heo WRITE_ONCE(scx_cid_topo, cid_topo); 67*bba2c361STejun Heo return 0; 68*bba2c361STejun Heo } 69*bba2c361STejun Heo 70*bba2c361STejun Heo /** 71*bba2c361STejun Heo * scx_cid_init - build the cid mapping 72*bba2c361STejun Heo * @sch: the scx_sched being initialized; used as the scx_error() target 73*bba2c361STejun Heo * 74*bba2c361STejun Heo * See "Topological CPU IDs" in cid.h for the model. Walk online cpus by 75*bba2c361STejun Heo * intersection at each level (parent_scratch & this_level_mask), which keeps 76*bba2c361STejun Heo * containment correct by construction and naturally splits a physical LLC 77*bba2c361STejun Heo * straddling two NUMA nodes into two LLC units. The caller must hold 78*bba2c361STejun Heo * cpus_read_lock. 79*bba2c361STejun Heo */ 80*bba2c361STejun Heo s32 scx_cid_init(struct scx_sched *sch) 81*bba2c361STejun Heo { 82*bba2c361STejun Heo cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL; 83*bba2c361STejun Heo cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 84*bba2c361STejun Heo cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 85*bba2c361STejun Heo cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL; 86*bba2c361STejun Heo cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL; 87*bba2c361STejun Heo cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL; 88*bba2c361STejun Heo u32 next_cid = 0; 89*bba2c361STejun Heo s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0; 90*bba2c361STejun Heo s32 cpu, ret; 91*bba2c361STejun Heo 92*bba2c361STejun Heo /* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */ 93*bba2c361STejun Heo BUILD_BUG_ON(NR_CPUS > 8192); 94*bba2c361STejun Heo 95*bba2c361STejun Heo lockdep_assert_cpus_held(); 96*bba2c361STejun Heo 97*bba2c361STejun Heo ret = scx_cid_arrays_alloc(); 98*bba2c361STejun Heo if (ret) 99*bba2c361STejun Heo return ret; 100*bba2c361STejun Heo 101*bba2c361STejun Heo if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) || 102*bba2c361STejun Heo !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) || 103*bba2c361STejun Heo !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) || 104*bba2c361STejun Heo !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) || 105*bba2c361STejun Heo !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) || 106*bba2c361STejun Heo !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL)) 107*bba2c361STejun Heo return -ENOMEM; 108*bba2c361STejun Heo 109*bba2c361STejun Heo /* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */ 110*bba2c361STejun Heo for (cpu = 0; cpu < nr_cpu_ids; cpu++) 111*bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = -1; 112*bba2c361STejun Heo 113*bba2c361STejun Heo cpumask_copy(to_walk, cpu_online_mask); 114*bba2c361STejun Heo 115*bba2c361STejun Heo while (!cpumask_empty(to_walk)) { 116*bba2c361STejun Heo s32 next_cpu = cpumask_first(to_walk); 117*bba2c361STejun Heo s32 nid = cpu_to_node(next_cpu); 118*bba2c361STejun Heo s32 node_cid = next_cid; 119*bba2c361STejun Heo s32 node_idx; 120*bba2c361STejun Heo 121*bba2c361STejun Heo /* 122*bba2c361STejun Heo * No NUMA info: skip and let the tail loop assign a no-topo 123*bba2c361STejun Heo * cid. cpumask_of_node(-1) is undefined. 124*bba2c361STejun Heo */ 125*bba2c361STejun Heo if (nid < 0) { 126*bba2c361STejun Heo cpumask_clear_cpu(next_cpu, to_walk); 127*bba2c361STejun Heo continue; 128*bba2c361STejun Heo } 129*bba2c361STejun Heo 130*bba2c361STejun Heo node_idx = next_node_idx++; 131*bba2c361STejun Heo 132*bba2c361STejun Heo /* node_scratch = to_walk & this node */ 133*bba2c361STejun Heo cpumask_and(node_scratch, to_walk, cpumask_of_node(nid)); 134*bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch))) 135*bba2c361STejun Heo return -EINVAL; 136*bba2c361STejun Heo 137*bba2c361STejun Heo while (!cpumask_empty(node_scratch)) { 138*bba2c361STejun Heo s32 ncpu = cpumask_first(node_scratch); 139*bba2c361STejun Heo const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback); 140*bba2c361STejun Heo s32 llc_cid = next_cid; 141*bba2c361STejun Heo s32 llc_idx = next_llc_idx++; 142*bba2c361STejun Heo 143*bba2c361STejun Heo /* llc_scratch = node_scratch & this llc */ 144*bba2c361STejun Heo cpumask_and(llc_scratch, node_scratch, llc_mask); 145*bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch))) 146*bba2c361STejun Heo return -EINVAL; 147*bba2c361STejun Heo 148*bba2c361STejun Heo while (!cpumask_empty(llc_scratch)) { 149*bba2c361STejun Heo s32 lcpu = cpumask_first(llc_scratch); 150*bba2c361STejun Heo const struct cpumask *sib = topology_sibling_cpumask(lcpu); 151*bba2c361STejun Heo s32 core_cid = next_cid; 152*bba2c361STejun Heo s32 core_idx = next_core_idx++; 153*bba2c361STejun Heo s32 ccpu; 154*bba2c361STejun Heo 155*bba2c361STejun Heo /* core_scratch = llc_scratch & this core */ 156*bba2c361STejun Heo cpumask_and(core_scratch, llc_scratch, sib); 157*bba2c361STejun Heo if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch))) 158*bba2c361STejun Heo return -EINVAL; 159*bba2c361STejun Heo 160*bba2c361STejun Heo for_each_cpu(ccpu, core_scratch) { 161*bba2c361STejun Heo s32 cid = next_cid++; 162*bba2c361STejun Heo 163*bba2c361STejun Heo scx_cid_to_cpu_tbl[cid] = ccpu; 164*bba2c361STejun Heo scx_cpu_to_cid_tbl[ccpu] = cid; 165*bba2c361STejun Heo scx_cid_topo[cid] = (struct scx_cid_topo){ 166*bba2c361STejun Heo .core_cid = core_cid, 167*bba2c361STejun Heo .core_idx = core_idx, 168*bba2c361STejun Heo .llc_cid = llc_cid, 169*bba2c361STejun Heo .llc_idx = llc_idx, 170*bba2c361STejun Heo .node_cid = node_cid, 171*bba2c361STejun Heo .node_idx = node_idx, 172*bba2c361STejun Heo }; 173*bba2c361STejun Heo 174*bba2c361STejun Heo cpumask_clear_cpu(ccpu, llc_scratch); 175*bba2c361STejun Heo cpumask_clear_cpu(ccpu, node_scratch); 176*bba2c361STejun Heo cpumask_clear_cpu(ccpu, to_walk); 177*bba2c361STejun Heo } 178*bba2c361STejun Heo } 179*bba2c361STejun Heo } 180*bba2c361STejun Heo } 181*bba2c361STejun Heo 182*bba2c361STejun Heo /* 183*bba2c361STejun Heo * No-topo section: any possible cpu without a cid - normally just the 184*bba2c361STejun Heo * not-online ones. Collect any currently-online cpus that land here in 185*bba2c361STejun Heo * @online_no_topo so we can warn about them at the end. 186*bba2c361STejun Heo */ 187*bba2c361STejun Heo for_each_cpu(cpu, cpu_possible_mask) { 188*bba2c361STejun Heo s32 cid; 189*bba2c361STejun Heo 190*bba2c361STejun Heo if (__scx_cpu_to_cid(cpu) != -1) 191*bba2c361STejun Heo continue; 192*bba2c361STejun Heo if (cpu_online(cpu)) 193*bba2c361STejun Heo cpumask_set_cpu(cpu, online_no_topo); 194*bba2c361STejun Heo 195*bba2c361STejun Heo cid = next_cid++; 196*bba2c361STejun Heo scx_cid_to_cpu_tbl[cid] = cpu; 197*bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = cid; 198*bba2c361STejun Heo scx_cid_topo[cid] = SCX_CID_TOPO_NEG; 199*bba2c361STejun Heo } 200*bba2c361STejun Heo 201*bba2c361STejun Heo if (!cpumask_empty(llc_fallback)) 202*bba2c361STejun Heo pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n", 203*bba2c361STejun Heo cpumask_pr_args(llc_fallback)); 204*bba2c361STejun Heo if (!cpumask_empty(online_no_topo)) 205*bba2c361STejun Heo pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n", 206*bba2c361STejun Heo cpumask_pr_args(online_no_topo)); 207*bba2c361STejun Heo 208*bba2c361STejun Heo return 0; 209*bba2c361STejun Heo } 210*bba2c361STejun Heo 211*bba2c361STejun Heo /** 212*bba2c361STejun Heo * scx_cmask_clear - Zero every bit in @m's active range 213*bba2c361STejun Heo * @m: cmask to clear 214*bba2c361STejun Heo * 215*bba2c361STejun Heo * Storage past the active range is left as is. 216*bba2c361STejun Heo */ 217*bba2c361STejun Heo void scx_cmask_clear(struct scx_cmask *m) 218*bba2c361STejun Heo { 219*bba2c361STejun Heo u32 nr_words; 220*bba2c361STejun Heo 221*bba2c361STejun Heo if (!m->nr_cids) 222*bba2c361STejun Heo return; 223*bba2c361STejun Heo nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; 224*bba2c361STejun Heo memset(m->bits, 0, nr_words * sizeof(u64)); 225*bba2c361STejun Heo } 226*bba2c361STejun Heo 227*bba2c361STejun Heo /** 228*bba2c361STejun Heo * scx_cmask_fill - Set every bit in @m's active range 229*bba2c361STejun Heo * @m: cmask to fill 230*bba2c361STejun Heo * 231*bba2c361STejun Heo * Counterpart to scx_cmask_clear(). Storage past the active range is left as is. 232*bba2c361STejun Heo */ 233*bba2c361STejun Heo void scx_cmask_fill(struct scx_cmask *m) 234*bba2c361STejun Heo { 235*bba2c361STejun Heo u32 nr_words, head_bits, tail_bits; 236*bba2c361STejun Heo 237*bba2c361STejun Heo if (!m->nr_cids) 238*bba2c361STejun Heo return; 239*bba2c361STejun Heo nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1; 240*bba2c361STejun Heo memset(m->bits, 0xff, nr_words * sizeof(u64)); 241*bba2c361STejun Heo 242*bba2c361STejun Heo /* clear word-0 bits below base */ 243*bba2c361STejun Heo head_bits = m->base & 63; 244*bba2c361STejun Heo if (head_bits) 245*bba2c361STejun Heo m->bits[0] &= ~((1ULL << head_bits) - 1); 246*bba2c361STejun Heo 247*bba2c361STejun Heo /* clear last-word bits at or past base + nr_cids */ 248*bba2c361STejun Heo tail_bits = (m->base + m->nr_cids) & 63; 249*bba2c361STejun Heo if (tail_bits) 250*bba2c361STejun Heo m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1; 251*bba2c361STejun Heo } 252*bba2c361STejun Heo 253*bba2c361STejun Heo /** 254*bba2c361STejun Heo * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask 255*bba2c361STejun Heo * @src: source cpumask 256*bba2c361STejun Heo * @dst: cmask to write 257*bba2c361STejun Heo * 258*bba2c361STejun Heo * Clear @dst's active range and set the bit for each cid whose cpu is in 259*bba2c361STejun Heo * @src and lies within that range. Out-of-range cids are silently ignored. 260*bba2c361STejun Heo */ 261*bba2c361STejun Heo void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst) 262*bba2c361STejun Heo { 263*bba2c361STejun Heo s32 cpu; 264*bba2c361STejun Heo 265*bba2c361STejun Heo scx_cmask_clear(dst); 266*bba2c361STejun Heo for_each_cpu(cpu, src) { 267*bba2c361STejun Heo s32 cid = __scx_cpu_to_cid(cpu); 268*bba2c361STejun Heo 269*bba2c361STejun Heo if (cid >= 0) 270*bba2c361STejun Heo __scx_cmask_set(cid, dst); 271*bba2c361STejun Heo } 272*bba2c361STejun Heo } 273*bba2c361STejun Heo 274*bba2c361STejun Heo __bpf_kfunc_start_defs(); 275*bba2c361STejun Heo 276*bba2c361STejun Heo /** 277*bba2c361STejun Heo * scx_bpf_cid_override - Install an explicit cpu->cid mapping 278*bba2c361STejun Heo * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu) 279*bba2c361STejun Heo * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes 280*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 281*bba2c361STejun Heo * 282*bba2c361STejun Heo * May only be called from ops.init() of the root scheduler. Replace the 283*bba2c361STejun Heo * topology-probed cid mapping with the caller-provided one. Each possible cpu 284*bba2c361STejun Heo * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared. 285*bba2c361STejun Heo * On invalid input, trigger scx_error() to abort the scheduler. 286*bba2c361STejun Heo */ 287*bba2c361STejun Heo __bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz, 288*bba2c361STejun Heo const struct bpf_prog_aux *aux) 289*bba2c361STejun Heo { 290*bba2c361STejun Heo cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL; 291*bba2c361STejun Heo struct scx_sched *sch; 292*bba2c361STejun Heo bool alloced; 293*bba2c361STejun Heo s32 cpu, cid; 294*bba2c361STejun Heo 295*bba2c361STejun Heo /* GFP_KERNEL alloc must happen before the rcu read section */ 296*bba2c361STejun Heo alloced = zalloc_cpumask_var(&seen, GFP_KERNEL); 297*bba2c361STejun Heo 298*bba2c361STejun Heo guard(rcu)(); 299*bba2c361STejun Heo 300*bba2c361STejun Heo sch = scx_prog_sched(aux); 301*bba2c361STejun Heo if (unlikely(!sch)) 302*bba2c361STejun Heo return; 303*bba2c361STejun Heo 304*bba2c361STejun Heo if (!alloced) { 305*bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask"); 306*bba2c361STejun Heo return; 307*bba2c361STejun Heo } 308*bba2c361STejun Heo 309*bba2c361STejun Heo if (scx_parent(sch)) { 310*bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override() only allowed from root sched"); 311*bba2c361STejun Heo return; 312*bba2c361STejun Heo } 313*bba2c361STejun Heo 314*bba2c361STejun Heo if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) { 315*bba2c361STejun Heo scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u", 316*bba2c361STejun Heo nr_cpu_ids * sizeof(s32), cpu_to_cid__sz); 317*bba2c361STejun Heo return; 318*bba2c361STejun Heo } 319*bba2c361STejun Heo 320*bba2c361STejun Heo for_each_possible_cpu(cpu) { 321*bba2c361STejun Heo s32 c = cpu_to_cid[cpu]; 322*bba2c361STejun Heo 323*bba2c361STejun Heo if (!cid_valid(sch, c)) 324*bba2c361STejun Heo return; 325*bba2c361STejun Heo if (cpumask_test_and_set_cpu(c, seen)) { 326*bba2c361STejun Heo scx_error(sch, "cid %d assigned to multiple cpus", c); 327*bba2c361STejun Heo return; 328*bba2c361STejun Heo } 329*bba2c361STejun Heo scx_cpu_to_cid_tbl[cpu] = c; 330*bba2c361STejun Heo scx_cid_to_cpu_tbl[c] = cpu; 331*bba2c361STejun Heo } 332*bba2c361STejun Heo 333*bba2c361STejun Heo /* Invalidate stale topo info - the override carries no topology. */ 334*bba2c361STejun Heo for (cid = 0; cid < num_possible_cpus(); cid++) 335*bba2c361STejun Heo scx_cid_topo[cid] = SCX_CID_TOPO_NEG; 336*bba2c361STejun Heo } 337*bba2c361STejun Heo 338*bba2c361STejun Heo /** 339*bba2c361STejun Heo * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid 340*bba2c361STejun Heo * @cid: cid to look up 341*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 342*bba2c361STejun Heo * 343*bba2c361STejun Heo * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if 344*bba2c361STejun Heo * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the 345*bba2c361STejun Heo * loaded scheduler, so the BPF side can cache the result to avoid repeated 346*bba2c361STejun Heo * kfunc invocations. 347*bba2c361STejun Heo */ 348*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux) 349*bba2c361STejun Heo { 350*bba2c361STejun Heo struct scx_sched *sch; 351*bba2c361STejun Heo 352*bba2c361STejun Heo guard(rcu)(); 353*bba2c361STejun Heo 354*bba2c361STejun Heo sch = scx_prog_sched(aux); 355*bba2c361STejun Heo if (unlikely(!sch)) 356*bba2c361STejun Heo return -EINVAL; 357*bba2c361STejun Heo return scx_cid_to_cpu(sch, cid); 358*bba2c361STejun Heo } 359*bba2c361STejun Heo 360*bba2c361STejun Heo /** 361*bba2c361STejun Heo * scx_bpf_cpu_to_cid - Return the cid for @cpu 362*bba2c361STejun Heo * @cpu: cpu to look up 363*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 364*bba2c361STejun Heo * 365*bba2c361STejun Heo * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is 366*bba2c361STejun Heo * invalid. The cid<->cpu mapping is static for the lifetime of the loaded 367*bba2c361STejun Heo * scheduler, so the BPF side can cache the result to avoid repeated kfunc 368*bba2c361STejun Heo * invocations. 369*bba2c361STejun Heo */ 370*bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux) 371*bba2c361STejun Heo { 372*bba2c361STejun Heo struct scx_sched *sch; 373*bba2c361STejun Heo 374*bba2c361STejun Heo guard(rcu)(); 375*bba2c361STejun Heo 376*bba2c361STejun Heo sch = scx_prog_sched(aux); 377*bba2c361STejun Heo if (unlikely(!sch)) 378*bba2c361STejun Heo return -EINVAL; 379*bba2c361STejun Heo return scx_cpu_to_cid(sch, cpu); 380*bba2c361STejun Heo } 381*bba2c361STejun Heo 382*bba2c361STejun Heo /* 383*bba2c361STejun Heo * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating 384*bba2c361STejun Heo * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms; 385*bba2c361STejun Heo * cmask_walk_op1() does the same shape over a single cmask range. Every public 386*bba2c361STejun Heo * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and 387*bba2c361STejun Heo * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the 388*bba2c361STejun Heo * selected op and cmask_op2_is_pred() folds the predicate early-exit out of 389*bba2c361STejun Heo * mutating ops. 390*bba2c361STejun Heo * 391*bba2c361STejun Heo * Two-cmask ops only touch @dst bits inside the intersection of the two ranges; 392*bba2c361STejun Heo * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero 393*bba2c361STejun Heo * @dst bits that lie outside @src's range. 394*bba2c361STejun Heo * 395*bba2c361STejun Heo * The _RACY variants are otherwise identical to their non-racy counterpart but 396*bba2c361STejun Heo * read @src word-by-word via data_race(). Memory ordering with concurrent 397*bba2c361STejun Heo * writers is the caller's responsibility. 398*bba2c361STejun Heo */ 399*bba2c361STejun Heo enum cmask_op2 { 400*bba2c361STejun Heo /* mutating */ 401*bba2c361STejun Heo CMASK_OP2_AND, 402*bba2c361STejun Heo CMASK_OP2_OR, 403*bba2c361STejun Heo CMASK_OP2_OR_RACY, 404*bba2c361STejun Heo CMASK_OP2_COPY, 405*bba2c361STejun Heo CMASK_OP2_COPY_RACY, 406*bba2c361STejun Heo CMASK_OP2_ANDNOT, 407*bba2c361STejun Heo /* predicates - short-circuit when the per-word result is true */ 408*bba2c361STejun Heo CMASK_OP2_SUBSET, 409*bba2c361STejun Heo CMASK_OP2_INTERSECTS, 410*bba2c361STejun Heo }; 411*bba2c361STejun Heo 412*bba2c361STejun Heo static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op) 413*bba2c361STejun Heo { 414*bba2c361STejun Heo return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS; 415*bba2c361STejun Heo } 416*bba2c361STejun Heo 417*bba2c361STejun Heo static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask, 418*bba2c361STejun Heo const enum cmask_op2 op) 419*bba2c361STejun Heo { 420*bba2c361STejun Heo switch (op) { 421*bba2c361STejun Heo case CMASK_OP2_AND: 422*bba2c361STejun Heo *av &= ~mask | *bp; 423*bba2c361STejun Heo return false; 424*bba2c361STejun Heo case CMASK_OP2_OR: 425*bba2c361STejun Heo *av |= *bp & mask; 426*bba2c361STejun Heo return false; 427*bba2c361STejun Heo case CMASK_OP2_OR_RACY: 428*bba2c361STejun Heo *av |= data_race(*bp) & mask; 429*bba2c361STejun Heo return false; 430*bba2c361STejun Heo case CMASK_OP2_COPY: 431*bba2c361STejun Heo *av = (*av & ~mask) | (*bp & mask); 432*bba2c361STejun Heo return false; 433*bba2c361STejun Heo case CMASK_OP2_COPY_RACY: 434*bba2c361STejun Heo *av = (*av & ~mask) | (data_race(*bp) & mask); 435*bba2c361STejun Heo return false; 436*bba2c361STejun Heo case CMASK_OP2_ANDNOT: 437*bba2c361STejun Heo *av &= ~(*bp & mask); 438*bba2c361STejun Heo return false; 439*bba2c361STejun Heo case CMASK_OP2_SUBSET: 440*bba2c361STejun Heo /* stop on the first bit in @sub not set in @super */ 441*bba2c361STejun Heo return (*bp & ~*av) & mask; 442*bba2c361STejun Heo case CMASK_OP2_INTERSECTS: 443*bba2c361STejun Heo return (*av & *bp) & mask; 444*bba2c361STejun Heo } 445*bba2c361STejun Heo unreachable(); 446*bba2c361STejun Heo } 447*bba2c361STejun Heo 448*bba2c361STejun Heo /* 449*bba2c361STejun Heo * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base, 450*bba2c361STejun Heo * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words 451*bba2c361STejun Heo * and return false; predicates return true on the first word whose per-word 452*bba2c361STejun Heo * test is true. Empty intersection returns false (matches "no bits to consider" 453*bba2c361STejun Heo * for both mutate and predicate). 454*bba2c361STejun Heo * 455*bba2c361STejun Heo * Base/nr_cids are taken as parameters so callers with snapshotted bounds can 456*bba2c361STejun Heo * drive the walk with values independent of the cmask's header. 457*bba2c361STejun Heo */ 458*bba2c361STejun Heo static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids, 459*bba2c361STejun Heo const u64 *b_bits, u32 b_base, u32 b_nr_cids, 460*bba2c361STejun Heo const enum cmask_op2 op) 461*bba2c361STejun Heo { 462*bba2c361STejun Heo u32 lo = max(a_base, b_base); 463*bba2c361STejun Heo u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids); 464*bba2c361STejun Heo u32 a_word_off = a_base / 64; 465*bba2c361STejun Heo u32 b_word_off = b_base / 64; 466*bba2c361STejun Heo u32 lo_word = lo / 64; 467*bba2c361STejun Heo u32 hi_word = (hi - 1) / 64; 468*bba2c361STejun Heo u64 head_mask = GENMASK_U64(63, lo & 63); 469*bba2c361STejun Heo u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); 470*bba2c361STejun Heo u32 w; 471*bba2c361STejun Heo 472*bba2c361STejun Heo if (lo >= hi) 473*bba2c361STejun Heo return false; 474*bba2c361STejun Heo 475*bba2c361STejun Heo if (lo_word == hi_word) 476*bba2c361STejun Heo return cmask_word_op2(&a_bits[lo_word - a_word_off], 477*bba2c361STejun Heo &b_bits[lo_word - b_word_off], 478*bba2c361STejun Heo head_mask & tail_mask, op); 479*bba2c361STejun Heo 480*bba2c361STejun Heo if (cmask_word_op2(&a_bits[lo_word - a_word_off], 481*bba2c361STejun Heo &b_bits[lo_word - b_word_off], head_mask, op) && 482*bba2c361STejun Heo cmask_op2_is_pred(op)) 483*bba2c361STejun Heo return true; 484*bba2c361STejun Heo 485*bba2c361STejun Heo for (w = lo_word + 1; w < hi_word; w++) 486*bba2c361STejun Heo if (cmask_word_op2(&a_bits[w - a_word_off], 487*bba2c361STejun Heo &b_bits[w - b_word_off], ~0ULL, op) && 488*bba2c361STejun Heo cmask_op2_is_pred(op)) 489*bba2c361STejun Heo return true; 490*bba2c361STejun Heo 491*bba2c361STejun Heo return cmask_word_op2(&a_bits[hi_word - a_word_off], 492*bba2c361STejun Heo &b_bits[hi_word - b_word_off], tail_mask, op); 493*bba2c361STejun Heo } 494*bba2c361STejun Heo 495*bba2c361STejun Heo enum cmask_op1 { 496*bba2c361STejun Heo CMASK_OP1_ANY_SET, 497*bba2c361STejun Heo }; 498*bba2c361STejun Heo 499*bba2c361STejun Heo static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask, 500*bba2c361STejun Heo const enum cmask_op1 op) 501*bba2c361STejun Heo { 502*bba2c361STejun Heo switch (op) { 503*bba2c361STejun Heo case CMASK_OP1_ANY_SET: 504*bba2c361STejun Heo return *ap & mask; 505*bba2c361STejun Heo } 506*bba2c361STejun Heo unreachable(); 507*bba2c361STejun Heo } 508*bba2c361STejun Heo 509*bba2c361STejun Heo /* 510*bba2c361STejun Heo * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op. 511*bba2c361STejun Heo * Returns true on the first word whose per-word test is true; returns false if 512*bba2c361STejun Heo * no word matches or the range is empty. All current op1s short-circuit on 513*bba2c361STejun Heo * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred() 514*bba2c361STejun Heo * guard analogous to cmask_op2_is_pred(). 515*bba2c361STejun Heo */ 516*bba2c361STejun Heo static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base, 517*bba2c361STejun Heo u32 a_nr_cids, 518*bba2c361STejun Heo const enum cmask_op1 op) 519*bba2c361STejun Heo { 520*bba2c361STejun Heo u32 lo = a_base; 521*bba2c361STejun Heo u32 hi = a_base + a_nr_cids; 522*bba2c361STejun Heo u32 a_word_off = a_base / 64; 523*bba2c361STejun Heo u32 lo_word = lo / 64; 524*bba2c361STejun Heo u32 hi_word = (hi - 1) / 64; 525*bba2c361STejun Heo u64 head_mask = GENMASK_U64(63, lo & 63); 526*bba2c361STejun Heo u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0); 527*bba2c361STejun Heo u32 w; 528*bba2c361STejun Heo 529*bba2c361STejun Heo if (lo >= hi) 530*bba2c361STejun Heo return false; 531*bba2c361STejun Heo 532*bba2c361STejun Heo if (lo_word == hi_word) 533*bba2c361STejun Heo return cmask_word_op1(&a_bits[lo_word - a_word_off], 534*bba2c361STejun Heo head_mask & tail_mask, op); 535*bba2c361STejun Heo 536*bba2c361STejun Heo if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op)) 537*bba2c361STejun Heo return true; 538*bba2c361STejun Heo for (w = lo_word + 1; w < hi_word; w++) 539*bba2c361STejun Heo if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op)) 540*bba2c361STejun Heo return true; 541*bba2c361STejun Heo return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op); 542*bba2c361STejun Heo } 543*bba2c361STejun Heo 544*bba2c361STejun Heo void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src) 545*bba2c361STejun Heo { 546*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 547*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_AND); 548*bba2c361STejun Heo } 549*bba2c361STejun Heo 550*bba2c361STejun Heo void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src) 551*bba2c361STejun Heo { 552*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 553*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_OR); 554*bba2c361STejun Heo } 555*bba2c361STejun Heo 556*bba2c361STejun Heo /** 557*bba2c361STejun Heo * scx_cmask_or_racy - OR @src into @dst, reading @src without locking 558*bba2c361STejun Heo * 559*bba2c361STejun Heo * @src is read word-by-word through data_race(). Same per-bit independence 560*bba2c361STejun Heo * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the 561*bba2c361STejun Heo * caller's responsibility. 562*bba2c361STejun Heo */ 563*bba2c361STejun Heo void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src) 564*bba2c361STejun Heo { 565*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 566*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY); 567*bba2c361STejun Heo } 568*bba2c361STejun Heo 569*bba2c361STejun Heo void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src) 570*bba2c361STejun Heo { 571*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 572*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_COPY); 573*bba2c361STejun Heo } 574*bba2c361STejun Heo 575*bba2c361STejun Heo /** 576*bba2c361STejun Heo * scx_cmask_copy_racy - Snapshot @src into @dst without locking 577*bba2c361STejun Heo * 578*bba2c361STejun Heo * @src is read word-by-word through data_race(). Head/tail masking matches 579*bba2c361STejun Heo * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates 580*bba2c361STejun Heo * just leave some bits fresher than others. Memory ordering with writers is 581*bba2c361STejun Heo * the caller's responsibility. 582*bba2c361STejun Heo */ 583*bba2c361STejun Heo void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src) 584*bba2c361STejun Heo { 585*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 586*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY); 587*bba2c361STejun Heo } 588*bba2c361STejun Heo 589*bba2c361STejun Heo void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src) 590*bba2c361STejun Heo { 591*bba2c361STejun Heo cmask_walk_op2(dst->bits, dst->base, dst->nr_cids, 592*bba2c361STejun Heo src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT); 593*bba2c361STejun Heo } 594*bba2c361STejun Heo 595*bba2c361STejun Heo /* 596*bba2c361STejun Heo * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure 597*bba2c361STejun Heo * [@lo, @hi) is contained in @cm's range. 598*bba2c361STejun Heo */ 599*bba2c361STejun Heo static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi) 600*bba2c361STejun Heo { 601*bba2c361STejun Heo if (lo >= hi) 602*bba2c361STejun Heo return false; 603*bba2c361STejun Heo return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo, 604*bba2c361STejun Heo CMASK_OP1_ANY_SET); 605*bba2c361STejun Heo } 606*bba2c361STejun Heo 607*bba2c361STejun Heo /** 608*bba2c361STejun Heo * scx_cmask_subset - test whether @sub is a subset of @super 609*bba2c361STejun Heo * @sub: cmask to test 610*bba2c361STejun Heo * @super: cmask to test against 611*bba2c361STejun Heo * 612*bba2c361STejun Heo * Return true iff every set bit of @sub is also set in @super. 613*bba2c361STejun Heo */ 614*bba2c361STejun Heo bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super) 615*bba2c361STejun Heo { 616*bba2c361STejun Heo u32 super_end = super->base + super->nr_cids; 617*bba2c361STejun Heo u32 sub_end = sub->base + sub->nr_cids; 618*bba2c361STejun Heo 619*bba2c361STejun Heo /* 620*bba2c361STejun Heo * Set bits in @sub outside @super's range can't be in @super, so any 621*bba2c361STejun Heo * such bit means not a subset. The walk below only visits words 622*bba2c361STejun Heo * common to both ranges, so these need a separate scan. 623*bba2c361STejun Heo */ 624*bba2c361STejun Heo if (sub->base < super->base && 625*bba2c361STejun Heo cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end))) 626*bba2c361STejun Heo return false; 627*bba2c361STejun Heo if (sub_end > super_end && 628*bba2c361STejun Heo cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end)) 629*bba2c361STejun Heo return false; 630*bba2c361STejun Heo 631*bba2c361STejun Heo return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids, 632*bba2c361STejun Heo sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET); 633*bba2c361STejun Heo } 634*bba2c361STejun Heo 635*bba2c361STejun Heo bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b) 636*bba2c361STejun Heo { 637*bba2c361STejun Heo return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids, 638*bba2c361STejun Heo b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS); 639*bba2c361STejun Heo } 640*bba2c361STejun Heo 641*bba2c361STejun Heo /** 642*bba2c361STejun Heo * scx_cmask_empty - Test whether @m has no bits set 643*bba2c361STejun Heo * @m: cmask to test 644*bba2c361STejun Heo * 645*bba2c361STejun Heo * Return true iff @m's active range has no bits set. 646*bba2c361STejun Heo */ 647*bba2c361STejun Heo bool scx_cmask_empty(const struct scx_cmask *m) 648*bba2c361STejun Heo { 649*bba2c361STejun Heo return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids); 650*bba2c361STejun Heo } 651*bba2c361STejun Heo 652*bba2c361STejun Heo /** 653*bba2c361STejun Heo * scx_bpf_cid_topo - Copy out per-cid topology info 654*bba2c361STejun Heo * @cid: cid to look up 655*bba2c361STejun Heo * @out__uninit: where to copy the topology info; fully written by this call 656*bba2c361STejun Heo * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs 657*bba2c361STejun Heo * 658*bba2c361STejun Heo * Fill @out__uninit with the topology info for @cid. Trigger scx_error() if 659*bba2c361STejun Heo * @cid is out of range. If @cid is valid but in the no-topo section, all fields 660*bba2c361STejun Heo * are set to -1. 661*bba2c361STejun Heo */ 662*bba2c361STejun Heo __bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit, 663*bba2c361STejun Heo const struct bpf_prog_aux *aux) 664*bba2c361STejun Heo { 665*bba2c361STejun Heo struct scx_sched *sch; 666*bba2c361STejun Heo 667*bba2c361STejun Heo guard(rcu)(); 668*bba2c361STejun Heo 669*bba2c361STejun Heo sch = scx_prog_sched(aux); 670*bba2c361STejun Heo if (unlikely(!sch) || !cid_valid(sch, cid)) { 671*bba2c361STejun Heo *out__uninit = SCX_CID_TOPO_NEG; 672*bba2c361STejun Heo return; 673*bba2c361STejun Heo } 674*bba2c361STejun Heo 675*bba2c361STejun Heo *out__uninit = READ_ONCE(scx_cid_topo)[cid]; 676*bba2c361STejun Heo } 677*bba2c361STejun Heo 678*bba2c361STejun Heo __bpf_kfunc_end_defs(); 679*bba2c361STejun Heo 680*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_init) 681*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE) 682*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_init) 683*bba2c361STejun Heo 684*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_init = { 685*bba2c361STejun Heo .owner = THIS_MODULE, 686*bba2c361STejun Heo .set = &scx_kfunc_ids_init, 687*bba2c361STejun Heo .filter = scx_kfunc_context_filter, 688*bba2c361STejun Heo }; 689*bba2c361STejun Heo 690*bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_cid) 691*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS) 692*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS) 693*bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS) 694*bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_cid) 695*bba2c361STejun Heo 696*bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_cid = { 697*bba2c361STejun Heo .owner = THIS_MODULE, 698*bba2c361STejun Heo .set = &scx_kfunc_ids_cid, 699*bba2c361STejun Heo }; 700*bba2c361STejun Heo 701*bba2c361STejun Heo int scx_cid_kfunc_init(void) 702*bba2c361STejun Heo { 703*bba2c361STejun Heo return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init) ?: 704*bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid) ?: 705*bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid) ?: 706*bba2c361STejun Heo register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid); 707*bba2c361STejun Heo } 708