1*bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */ 2*bba2c361STejun Heo /* 3*bba2c361STejun Heo * Topological CPU IDs (cids) 4*bba2c361STejun Heo * -------------------------- 5*bba2c361STejun Heo * 6*bba2c361STejun Heo * Raw cpu numbers are clumsy for sharding work and communication across 7*bba2c361STejun Heo * topology units, especially from BPF: the space can be sparse, numerical 8*bba2c361STejun Heo * closeness doesn't imply topological closeness (x86 hyperthreading often puts 9*bba2c361STejun Heo * SMT siblings far apart), and a range of cpu ids doesn't mean anything. 10*bba2c361STejun Heo * Sub-scheds make this acute - cpu allocation, revocation and other state are 11*bba2c361STejun Heo * constantly communicated across sub-scheds, and passing whole cpumasks scales 12*bba2c361STejun Heo * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length 13*bba2c361STejun Heo * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences 14*bba2c361STejun Heo * for every op. 15*bba2c361STejun Heo * 16*bba2c361STejun Heo * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or 17*bba2c361STejun Heo * NUMA node get contiguous cid ranges, so a topology unit becomes a (start, 18*bba2c361STejun Heo * length) slice of cid space. Communication can pass a slice instead of a 19*bba2c361STejun Heo * cpumask, and BPF code can process, for example, a u64 word's worth of cids at 20*bba2c361STejun Heo * a time. 21*bba2c361STejun Heo * 22*bba2c361STejun Heo * The mapping is built once at root scheduler enable time by walking the 23*bba2c361STejun Heo * topology of online cpus only. Going by online cpus is out of necessity: 24*bba2c361STejun Heo * depending on the arch, topology info isn't reliably available for offline 25*bba2c361STejun Heo * cpus. The expected usage model is restarting the scheduler on hotplug events 26*bba2c361STejun Heo * so the mapping is rebuilt against the new online set. A scheduler that wants 27*bba2c361STejun Heo * to handle hotplug without a restart can provide its own cid and shard mapping 28*bba2c361STejun Heo * through the override interface. 29*bba2c361STejun Heo * 30*bba2c361STejun Heo * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 31*bba2c361STejun Heo * Copyright (c) 2026 Tejun Heo <tj@kernel.org> 32*bba2c361STejun Heo */ 33*bba2c361STejun Heo #ifndef _KERNEL_SCHED_EXT_CID_H 34*bba2c361STejun Heo #define _KERNEL_SCHED_EXT_CID_H 35*bba2c361STejun Heo 36*bba2c361STejun Heo struct scx_sched; 37*bba2c361STejun Heo 38*bba2c361STejun Heo /* 39*bba2c361STejun Heo * Cid space (total is always num_possible_cpus()) is laid out with 40*bba2c361STejun Heo * topology-annotated cids first, then no-topo cids at the tail. The 41*bba2c361STejun Heo * topology-annotated block covers the cpus that were online when scx_cid_init() 42*bba2c361STejun Heo * ran and remains valid even after those cpus go offline. The tail block covers 43*bba2c361STejun Heo * possible-but-not-online cpus and carries all-(-1) topo info (see 44*bba2c361STejun Heo * scx_cid_topo); callers detect it via the -1 sentinels. 45*bba2c361STejun Heo * 46*bba2c361STejun Heo * See the comment above the table definitions in cid.c for the 47*bba2c361STejun Heo * memory-ordering and visibility contract. 48*bba2c361STejun Heo */ 49*bba2c361STejun Heo extern s16 *scx_cid_to_cpu_tbl; 50*bba2c361STejun Heo extern s16 *scx_cpu_to_cid_tbl; 51*bba2c361STejun Heo extern struct scx_cid_topo *scx_cid_topo; 52*bba2c361STejun Heo extern struct btf_id_set8 scx_kfunc_ids_init; 53*bba2c361STejun Heo 54*bba2c361STejun Heo void scx_cmask_clear(struct scx_cmask *m); 55*bba2c361STejun Heo void scx_cmask_fill(struct scx_cmask *m); 56*bba2c361STejun Heo void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src); 57*bba2c361STejun Heo void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src); 58*bba2c361STejun Heo void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src); 59*bba2c361STejun Heo void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src); 60*bba2c361STejun Heo void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src); 61*bba2c361STejun Heo void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src); 62*bba2c361STejun Heo bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super); 63*bba2c361STejun Heo bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b); 64*bba2c361STejun Heo bool scx_cmask_empty(const struct scx_cmask *m); 65*bba2c361STejun Heo s32 scx_cid_init(struct scx_sched *sch); 66*bba2c361STejun Heo int scx_cid_kfunc_init(void); 67*bba2c361STejun Heo void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst); 68*bba2c361STejun Heo 69*bba2c361STejun Heo /** 70*bba2c361STejun Heo * cid_valid - Verify a cid value, to be used on ops input args 71*bba2c361STejun Heo * @sch: scx_sched to abort on error 72*bba2c361STejun Heo * @cid: cid which came from a BPF ops 73*bba2c361STejun Heo * 74*bba2c361STejun Heo * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger 75*bba2c361STejun Heo * scx_error() and return false. 76*bba2c361STejun Heo */ 77*bba2c361STejun Heo static inline bool cid_valid(struct scx_sched *sch, s32 cid) 78*bba2c361STejun Heo { 79*bba2c361STejun Heo if (likely(cid >= 0 && cid < num_possible_cpus())) 80*bba2c361STejun Heo return true; 81*bba2c361STejun Heo scx_error(sch, "invalid cid %d", cid); 82*bba2c361STejun Heo return false; 83*bba2c361STejun Heo } 84*bba2c361STejun Heo 85*bba2c361STejun Heo /** 86*bba2c361STejun Heo * __scx_cid_to_cpu - Unchecked cid->cpu table lookup 87*bba2c361STejun Heo * @cid: cid to look up. Must be in [0, num_possible_cpus()). 88*bba2c361STejun Heo * 89*bba2c361STejun Heo * Intended for callsites that have already validated @cid and that hold a 90*bba2c361STejun Heo * non-NULL @sch from scx_prog_sched() - a live sched implies the table has 91*bba2c361STejun Heo * been allocated, so no NULL check is needed here. 92*bba2c361STejun Heo */ 93*bba2c361STejun Heo static inline s32 __scx_cid_to_cpu(s32 cid) 94*bba2c361STejun Heo { 95*bba2c361STejun Heo /* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */ 96*bba2c361STejun Heo return READ_ONCE(scx_cid_to_cpu_tbl)[cid]; 97*bba2c361STejun Heo } 98*bba2c361STejun Heo 99*bba2c361STejun Heo /** 100*bba2c361STejun Heo * __scx_cpu_to_cid - Unchecked cpu->cid table lookup 101*bba2c361STejun Heo * @cpu: cpu to look up. Must be a valid possible cpu id. 102*bba2c361STejun Heo * 103*bba2c361STejun Heo * Same usage constraints as __scx_cid_to_cpu(). 104*bba2c361STejun Heo */ 105*bba2c361STejun Heo static inline s32 __scx_cpu_to_cid(s32 cpu) 106*bba2c361STejun Heo { 107*bba2c361STejun Heo return READ_ONCE(scx_cpu_to_cid_tbl)[cpu]; 108*bba2c361STejun Heo } 109*bba2c361STejun Heo 110*bba2c361STejun Heo /** 111*bba2c361STejun Heo * scx_cid_to_cpu - Translate @cid to its cpu 112*bba2c361STejun Heo * @sch: scx_sched for error reporting 113*bba2c361STejun Heo * @cid: cid to look up 114*bba2c361STejun Heo * 115*bba2c361STejun Heo * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers 116*bba2c361STejun Heo * scx_error() on @sch. The cid arrays are allocated on first scheduler enable 117*bba2c361STejun Heo * and never freed, so the returned cpu is stable for the lifetime of the loaded 118*bba2c361STejun Heo * scheduler. 119*bba2c361STejun Heo */ 120*bba2c361STejun Heo static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid) 121*bba2c361STejun Heo { 122*bba2c361STejun Heo if (!cid_valid(sch, cid)) 123*bba2c361STejun Heo return -EINVAL; 124*bba2c361STejun Heo return __scx_cid_to_cpu(cid); 125*bba2c361STejun Heo } 126*bba2c361STejun Heo 127*bba2c361STejun Heo /** 128*bba2c361STejun Heo * scx_cpu_to_cid - Translate @cpu to its cid 129*bba2c361STejun Heo * @sch: scx_sched for error reporting 130*bba2c361STejun Heo * @cpu: cpu to look up 131*bba2c361STejun Heo * 132*bba2c361STejun Heo * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers 133*bba2c361STejun Heo * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu(). 134*bba2c361STejun Heo */ 135*bba2c361STejun Heo static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu) 136*bba2c361STejun Heo { 137*bba2c361STejun Heo if (!scx_cpu_valid(sch, cpu, NULL)) 138*bba2c361STejun Heo return -EINVAL; 139*bba2c361STejun Heo return __scx_cpu_to_cid(cpu); 140*bba2c361STejun Heo } 141*bba2c361STejun Heo 142*bba2c361STejun Heo /** 143*bba2c361STejun Heo * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form 144*bba2c361STejun Heo */ 145*bba2c361STejun Heo static inline bool scx_is_cid_type(void) 146*bba2c361STejun Heo { 147*bba2c361STejun Heo return static_branch_unlikely(&__scx_is_cid_type); 148*bba2c361STejun Heo } 149*bba2c361STejun Heo 150*bba2c361STejun Heo static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m) 151*bba2c361STejun Heo { 152*bba2c361STejun Heo return likely(cid >= m->base && cid < m->base + m->nr_cids); 153*bba2c361STejun Heo } 154*bba2c361STejun Heo 155*bba2c361STejun Heo /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */ 156*bba2c361STejun Heo static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m) 157*bba2c361STejun Heo { 158*bba2c361STejun Heo return (u64 *)&m->bits[cid / 64 - m->base / 64]; 159*bba2c361STejun Heo } 160*bba2c361STejun Heo 161*bba2c361STejun Heo /** 162*bba2c361STejun Heo * __scx_cmask_init - Initialize @m with explicit storage capacity 163*bba2c361STejun Heo * @m: cmask to initialize 164*bba2c361STejun Heo * @base: first cid of the active range 165*bba2c361STejun Heo * @nr_cids: number of cids in the active range 166*bba2c361STejun Heo * @alloc_cids: storage capacity in cids, at least @nr_cids 167*bba2c361STejun Heo * 168*bba2c361STejun Heo * Use when storage is sized larger than the initial active range. All of 169*bba2c361STejun Heo * bits[] is zeroed. 170*bba2c361STejun Heo */ 171*bba2c361STejun Heo static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids, 172*bba2c361STejun Heo u32 alloc_cids) 173*bba2c361STejun Heo { 174*bba2c361STejun Heo if (WARN_ON_ONCE(alloc_cids < nr_cids)) 175*bba2c361STejun Heo nr_cids = alloc_cids; 176*bba2c361STejun Heo 177*bba2c361STejun Heo m->base = base; 178*bba2c361STejun Heo m->nr_cids = nr_cids; 179*bba2c361STejun Heo m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids); 180*bba2c361STejun Heo memset(m->bits, 0, m->alloc_words * sizeof(u64)); 181*bba2c361STejun Heo } 182*bba2c361STejun Heo 183*bba2c361STejun Heo /** 184*bba2c361STejun Heo * scx_cmask_init - Initialize @m on tight storage 185*bba2c361STejun Heo * @m: cmask to initialize 186*bba2c361STejun Heo * @base: first cid of the active range 187*bba2c361STejun Heo * @nr_cids: number of cids in the active range 188*bba2c361STejun Heo * 189*bba2c361STejun Heo * All of bits[] is zeroed. 190*bba2c361STejun Heo */ 191*bba2c361STejun Heo static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids) 192*bba2c361STejun Heo { 193*bba2c361STejun Heo __scx_cmask_init(m, base, nr_cids, nr_cids); 194*bba2c361STejun Heo } 195*bba2c361STejun Heo 196*bba2c361STejun Heo /** 197*bba2c361STejun Heo * scx_cmask_reframe - Reshape @m's active range without resizing storage 198*bba2c361STejun Heo * @m: cmask to reframe 199*bba2c361STejun Heo * @base: new active range base 200*bba2c361STejun Heo * @nr_cids: new active range length, must fit within @m->alloc_words 201*bba2c361STejun Heo * 202*bba2c361STejun Heo * Body bits within the new range become garbage - only the head and tail 203*bba2c361STejun Heo * words are zeroed to keep the padding invariant. 204*bba2c361STejun Heo */ 205*bba2c361STejun Heo static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids) 206*bba2c361STejun Heo { 207*bba2c361STejun Heo if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words)) 208*bba2c361STejun Heo return; 209*bba2c361STejun Heo 210*bba2c361STejun Heo if (nr_cids) { 211*bba2c361STejun Heo u32 last_word = ((base & 63) + nr_cids - 1) / 64; 212*bba2c361STejun Heo 213*bba2c361STejun Heo m->bits[0] = 0; 214*bba2c361STejun Heo m->bits[last_word] = 0; 215*bba2c361STejun Heo } 216*bba2c361STejun Heo 217*bba2c361STejun Heo m->base = base; 218*bba2c361STejun Heo m->nr_cids = nr_cids; 219*bba2c361STejun Heo } 220*bba2c361STejun Heo 221*bba2c361STejun Heo static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m) 222*bba2c361STejun Heo { 223*bba2c361STejun Heo if (!__scx_cmask_contains(cid, m)) 224*bba2c361STejun Heo return; 225*bba2c361STejun Heo *__scx_cmask_word(cid, m) |= BIT_U64(cid & 63); 226*bba2c361STejun Heo } 227*bba2c361STejun Heo 228*bba2c361STejun Heo /** 229*bba2c361STejun Heo * scx_cmask_test - test whether @cid is set in @m 230*bba2c361STejun Heo * @cid: cid to test 231*bba2c361STejun Heo * @m: cmask to test 232*bba2c361STejun Heo * 233*bba2c361STejun Heo * Return %false if @cid is outside @m's active range. Otherwise return the 234*bba2c361STejun Heo * bit's value. Read via READ_ONCE so callers can race set/clear writers. 235*bba2c361STejun Heo */ 236*bba2c361STejun Heo static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m) 237*bba2c361STejun Heo { 238*bba2c361STejun Heo if (!__scx_cmask_contains(cid, m)) 239*bba2c361STejun Heo return false; 240*bba2c361STejun Heo return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63); 241*bba2c361STejun Heo } 242*bba2c361STejun Heo 243*bba2c361STejun Heo /* 244*bba2c361STejun Heo * Words of bits[] the active range spans, 0 if empty. Tighter than the storage 245*bba2c361STejun Heo * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment. 246*bba2c361STejun Heo */ 247*bba2c361STejun Heo static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m) 248*bba2c361STejun Heo { 249*bba2c361STejun Heo if (!m->nr_cids) 250*bba2c361STejun Heo return 0; 251*bba2c361STejun Heo return ((m->base & 63) + m->nr_cids - 1) / 64 + 1; 252*bba2c361STejun Heo } 253*bba2c361STejun Heo 254*bba2c361STejun Heo /** 255*bba2c361STejun Heo * scx_cmask_for_each_cid - iterate set cids in @m 256*bba2c361STejun Heo * @cid: s32 loop var that receives each set cid in turn 257*bba2c361STejun Heo * @m: cmask to iterate 258*bba2c361STejun Heo * 259*bba2c361STejun Heo * Visits set bits within @m's active range in ascending order. Scans only the 260*bba2c361STejun Heo * words the active range spans, where head and tail padding is kept zero, so 261*bba2c361STejun Heo * no per-cid range check is needed. 262*bba2c361STejun Heo */ 263*bba2c361STejun Heo #define scx_cmask_for_each_cid(cid, m) \ 264*bba2c361STejun Heo for (u64 __bs = (m)->base & ~63u, __wi = 0, \ 265*bba2c361STejun Heo __nw = scx_cmask_nr_used_words(m); \ 266*bba2c361STejun Heo __wi < __nw; __wi++) \ 267*bba2c361STejun Heo for (u64 __w = READ_ONCE((m)->bits[__wi]); \ 268*bba2c361STejun Heo __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \ 269*bba2c361STejun Heo __w &= __w - 1) 270*bba2c361STejun Heo 271*bba2c361STejun Heo #endif /* _KERNEL_SCHED_EXT_CID_H */ 272