1bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */ 2bba2c361STejun Heo /* 3bba2c361STejun Heo * Topological CPU IDs (cids) 4bba2c361STejun Heo * -------------------------- 5bba2c361STejun Heo * 6bba2c361STejun Heo * Raw cpu numbers are clumsy for sharding work and communication across 7bba2c361STejun Heo * topology units, especially from BPF: the space can be sparse, numerical 8bba2c361STejun Heo * closeness doesn't imply topological closeness (x86 hyperthreading often puts 9bba2c361STejun Heo * SMT siblings far apart), and a range of cpu ids doesn't mean anything. 10bba2c361STejun Heo * Sub-scheds make this acute - cpu allocation, revocation and other state are 11bba2c361STejun Heo * constantly communicated across sub-scheds, and passing whole cpumasks scales 12bba2c361STejun Heo * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length 13bba2c361STejun Heo * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences 14bba2c361STejun Heo * for every op. 15bba2c361STejun Heo * 16bba2c361STejun Heo * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or 17bba2c361STejun Heo * NUMA node get contiguous cid ranges, so a topology unit becomes a (start, 18bba2c361STejun Heo * length) slice of cid space. Communication can pass a slice instead of a 19bba2c361STejun Heo * cpumask, and BPF code can process, for example, a u64 word's worth of cids at 20bba2c361STejun Heo * a time. 21bba2c361STejun Heo * 22bba2c361STejun Heo * The mapping is built once at root scheduler enable time by walking the 23bba2c361STejun Heo * topology of online cpus only. Going by online cpus is out of necessity: 24bba2c361STejun Heo * depending on the arch, topology info isn't reliably available for offline 25bba2c361STejun Heo * cpus. The expected usage model is restarting the scheduler on hotplug events 26bba2c361STejun Heo * so the mapping is rebuilt against the new online set. A scheduler that wants 27bba2c361STejun Heo * to handle hotplug without a restart can provide its own cid and shard mapping 28bba2c361STejun Heo * through the override interface. 29bba2c361STejun Heo * 30bba2c361STejun Heo * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 31bba2c361STejun Heo * Copyright (c) 2026 Tejun Heo <tj@kernel.org> 32bba2c361STejun Heo */ 33bba2c361STejun Heo #ifndef _KERNEL_SCHED_EXT_CID_H 34bba2c361STejun Heo #define _KERNEL_SCHED_EXT_CID_H 35bba2c361STejun Heo 363cd1f76bSTejun Heo #include "internal.h" 373cd1f76bSTejun Heo 38bba2c361STejun Heo struct scx_sched; 39bba2c361STejun Heo 40bba2c361STejun Heo /* 41bba2c361STejun Heo * Cid space (total is always num_possible_cpus()) is laid out with 42bba2c361STejun Heo * topology-annotated cids first, then no-topo cids at the tail. The 43bba2c361STejun Heo * topology-annotated block covers the cpus that were online when scx_cid_init() 44bba2c361STejun Heo * ran and remains valid even after those cpus go offline. The tail block covers 45bba2c361STejun Heo * possible-but-not-online cpus and carries all-(-1) topo info (see 46bba2c361STejun Heo * scx_cid_topo); callers detect it via the -1 sentinels. 47bba2c361STejun Heo * 48bba2c361STejun Heo * See the comment above the table definitions in cid.c for the 49bba2c361STejun Heo * memory-ordering and visibility contract. 50bba2c361STejun Heo */ 51bba2c361STejun Heo extern s16 *scx_cid_to_cpu_tbl; 52bba2c361STejun Heo extern s16 *scx_cpu_to_cid_tbl; 53bba2c361STejun Heo extern struct scx_cid_topo *scx_cid_topo; 54bba2c361STejun Heo extern struct btf_id_set8 scx_kfunc_ids_init; 55bba2c361STejun Heo 56bba2c361STejun Heo void scx_cmask_clear(struct scx_cmask *m); 57bba2c361STejun Heo void scx_cmask_fill(struct scx_cmask *m); 58bba2c361STejun Heo void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src); 59bba2c361STejun Heo void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src); 60bba2c361STejun Heo void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src); 61bba2c361STejun Heo void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src); 62bba2c361STejun Heo void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src); 63bba2c361STejun Heo void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src); 64bba2c361STejun Heo bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super); 65bba2c361STejun Heo bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b); 66bba2c361STejun Heo bool scx_cmask_empty(const struct scx_cmask *m); 67bba2c361STejun Heo s32 scx_cid_init(struct scx_sched *sch); 68bba2c361STejun Heo int scx_cid_kfunc_init(void); 69bba2c361STejun Heo void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst); 70bba2c361STejun Heo 71bba2c361STejun Heo /** 72bba2c361STejun Heo * cid_valid - Verify a cid value, to be used on ops input args 73bba2c361STejun Heo * @sch: scx_sched to abort on error 74bba2c361STejun Heo * @cid: cid which came from a BPF ops 75bba2c361STejun Heo * 76bba2c361STejun Heo * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger 77bba2c361STejun Heo * scx_error() and return false. 78bba2c361STejun Heo */ 79bba2c361STejun Heo static inline bool cid_valid(struct scx_sched *sch, s32 cid) 80bba2c361STejun Heo { 81bba2c361STejun Heo if (likely(cid >= 0 && cid < num_possible_cpus())) 82bba2c361STejun Heo return true; 83bba2c361STejun Heo scx_error(sch, "invalid cid %d", cid); 84bba2c361STejun Heo return false; 85bba2c361STejun Heo } 86bba2c361STejun Heo 87bba2c361STejun Heo /** 88bba2c361STejun Heo * __scx_cid_to_cpu - Unchecked cid->cpu table lookup 89bba2c361STejun Heo * @cid: cid to look up. Must be in [0, num_possible_cpus()). 90bba2c361STejun Heo * 91bba2c361STejun Heo * Intended for callsites that have already validated @cid and that hold a 92bba2c361STejun Heo * non-NULL @sch from scx_prog_sched() - a live sched implies the table has 93bba2c361STejun Heo * been allocated, so no NULL check is needed here. 94bba2c361STejun Heo */ 95bba2c361STejun Heo static inline s32 __scx_cid_to_cpu(s32 cid) 96bba2c361STejun Heo { 97bba2c361STejun Heo /* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */ 98bba2c361STejun Heo return READ_ONCE(scx_cid_to_cpu_tbl)[cid]; 99bba2c361STejun Heo } 100bba2c361STejun Heo 101bba2c361STejun Heo /** 102bba2c361STejun Heo * __scx_cpu_to_cid - Unchecked cpu->cid table lookup 103bba2c361STejun Heo * @cpu: cpu to look up. Must be a valid possible cpu id. 104bba2c361STejun Heo * 105bba2c361STejun Heo * Same usage constraints as __scx_cid_to_cpu(). 106bba2c361STejun Heo */ 107bba2c361STejun Heo static inline s32 __scx_cpu_to_cid(s32 cpu) 108bba2c361STejun Heo { 109bba2c361STejun Heo return READ_ONCE(scx_cpu_to_cid_tbl)[cpu]; 110bba2c361STejun Heo } 111bba2c361STejun Heo 112bba2c361STejun Heo /** 113bba2c361STejun Heo * scx_cid_to_cpu - Translate @cid to its cpu 114bba2c361STejun Heo * @sch: scx_sched for error reporting 115bba2c361STejun Heo * @cid: cid to look up 116bba2c361STejun Heo * 117bba2c361STejun Heo * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers 118bba2c361STejun Heo * scx_error() on @sch. The cid arrays are allocated on first scheduler enable 119bba2c361STejun Heo * and never freed, so the returned cpu is stable for the lifetime of the loaded 120bba2c361STejun Heo * scheduler. 121bba2c361STejun Heo */ 122bba2c361STejun Heo static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid) 123bba2c361STejun Heo { 124bba2c361STejun Heo if (!cid_valid(sch, cid)) 125bba2c361STejun Heo return -EINVAL; 126bba2c361STejun Heo return __scx_cid_to_cpu(cid); 127bba2c361STejun Heo } 128bba2c361STejun Heo 129bba2c361STejun Heo /** 130bba2c361STejun Heo * scx_cpu_to_cid - Translate @cpu to its cid 131bba2c361STejun Heo * @sch: scx_sched for error reporting 132bba2c361STejun Heo * @cpu: cpu to look up 133bba2c361STejun Heo * 134bba2c361STejun Heo * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers 135bba2c361STejun Heo * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu(). 136bba2c361STejun Heo */ 137bba2c361STejun Heo static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu) 138bba2c361STejun Heo { 139bba2c361STejun Heo if (!scx_cpu_valid(sch, cpu, NULL)) 140bba2c361STejun Heo return -EINVAL; 141bba2c361STejun Heo return __scx_cpu_to_cid(cpu); 142bba2c361STejun Heo } 143bba2c361STejun Heo 144bba2c361STejun Heo /** 145bba2c361STejun Heo * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form 146bba2c361STejun Heo */ 147bba2c361STejun Heo static inline bool scx_is_cid_type(void) 148bba2c361STejun Heo { 149bba2c361STejun Heo return static_branch_unlikely(&__scx_is_cid_type); 150bba2c361STejun Heo } 151bba2c361STejun Heo 152bba2c361STejun Heo static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m) 153bba2c361STejun Heo { 154bba2c361STejun Heo return likely(cid >= m->base && cid < m->base + m->nr_cids); 155bba2c361STejun Heo } 156bba2c361STejun Heo 157bba2c361STejun Heo /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */ 158bba2c361STejun Heo static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m) 159bba2c361STejun Heo { 160bba2c361STejun Heo return (u64 *)&m->bits[cid / 64 - m->base / 64]; 161bba2c361STejun Heo } 162bba2c361STejun Heo 163bba2c361STejun Heo /** 164bba2c361STejun Heo * __scx_cmask_init - Initialize @m with explicit storage capacity 165bba2c361STejun Heo * @m: cmask to initialize 166bba2c361STejun Heo * @base: first cid of the active range 167bba2c361STejun Heo * @nr_cids: number of cids in the active range 168bba2c361STejun Heo * @alloc_cids: storage capacity in cids, at least @nr_cids 169bba2c361STejun Heo * 170bba2c361STejun Heo * Use when storage is sized larger than the initial active range. All of 171bba2c361STejun Heo * bits[] is zeroed. 172bba2c361STejun Heo */ 173bba2c361STejun Heo static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids, 174bba2c361STejun Heo u32 alloc_cids) 175bba2c361STejun Heo { 176bba2c361STejun Heo if (WARN_ON_ONCE(alloc_cids < nr_cids)) 177bba2c361STejun Heo nr_cids = alloc_cids; 178bba2c361STejun Heo 179bba2c361STejun Heo m->base = base; 180bba2c361STejun Heo m->nr_cids = nr_cids; 181bba2c361STejun Heo m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids); 182bba2c361STejun Heo memset(m->bits, 0, m->alloc_words * sizeof(u64)); 183bba2c361STejun Heo } 184bba2c361STejun Heo 185bba2c361STejun Heo /** 186bba2c361STejun Heo * scx_cmask_init - Initialize @m on tight storage 187bba2c361STejun Heo * @m: cmask to initialize 188bba2c361STejun Heo * @base: first cid of the active range 189bba2c361STejun Heo * @nr_cids: number of cids in the active range 190bba2c361STejun Heo * 191bba2c361STejun Heo * All of bits[] is zeroed. 192bba2c361STejun Heo */ 193bba2c361STejun Heo static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids) 194bba2c361STejun Heo { 195bba2c361STejun Heo __scx_cmask_init(m, base, nr_cids, nr_cids); 196bba2c361STejun Heo } 197bba2c361STejun Heo 198bba2c361STejun Heo /** 199bba2c361STejun Heo * scx_cmask_reframe - Reshape @m's active range without resizing storage 200bba2c361STejun Heo * @m: cmask to reframe 201bba2c361STejun Heo * @base: new active range base 202bba2c361STejun Heo * @nr_cids: new active range length, must fit within @m->alloc_words 203bba2c361STejun Heo * 204bba2c361STejun Heo * Body bits within the new range become garbage - only the head and tail 205bba2c361STejun Heo * words are zeroed to keep the padding invariant. 206bba2c361STejun Heo */ 207bba2c361STejun Heo static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids) 208bba2c361STejun Heo { 209bba2c361STejun Heo if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words)) 210bba2c361STejun Heo return; 211bba2c361STejun Heo 212bba2c361STejun Heo if (nr_cids) { 213bba2c361STejun Heo u32 last_word = ((base & 63) + nr_cids - 1) / 64; 214bba2c361STejun Heo 215bba2c361STejun Heo m->bits[0] = 0; 216bba2c361STejun Heo m->bits[last_word] = 0; 217bba2c361STejun Heo } 218bba2c361STejun Heo 219bba2c361STejun Heo m->base = base; 220bba2c361STejun Heo m->nr_cids = nr_cids; 221bba2c361STejun Heo } 222bba2c361STejun Heo 223bba2c361STejun Heo static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m) 224bba2c361STejun Heo { 225bba2c361STejun Heo if (!__scx_cmask_contains(cid, m)) 226bba2c361STejun Heo return; 227bba2c361STejun Heo *__scx_cmask_word(cid, m) |= BIT_U64(cid & 63); 228bba2c361STejun Heo } 229bba2c361STejun Heo 230bba2c361STejun Heo /** 231bba2c361STejun Heo * scx_cmask_test - test whether @cid is set in @m 232bba2c361STejun Heo * @cid: cid to test 233bba2c361STejun Heo * @m: cmask to test 234bba2c361STejun Heo * 235bba2c361STejun Heo * Return %false if @cid is outside @m's active range. Otherwise return the 236bba2c361STejun Heo * bit's value. Read via READ_ONCE so callers can race set/clear writers. 237bba2c361STejun Heo */ 238bba2c361STejun Heo static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m) 239bba2c361STejun Heo { 240bba2c361STejun Heo if (!__scx_cmask_contains(cid, m)) 241bba2c361STejun Heo return false; 242bba2c361STejun Heo return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63); 243bba2c361STejun Heo } 244bba2c361STejun Heo 245bba2c361STejun Heo /* 246bba2c361STejun Heo * Words of bits[] the active range spans, 0 if empty. Tighter than the storage 247bba2c361STejun Heo * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment. 248bba2c361STejun Heo */ 249bba2c361STejun Heo static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m) 250bba2c361STejun Heo { 251bba2c361STejun Heo if (!m->nr_cids) 252bba2c361STejun Heo return 0; 253bba2c361STejun Heo return ((m->base & 63) + m->nr_cids - 1) / 64 + 1; 254bba2c361STejun Heo } 255bba2c361STejun Heo 256bba2c361STejun Heo /** 257bba2c361STejun Heo * scx_cmask_for_each_cid - iterate set cids in @m 258bba2c361STejun Heo * @cid: s32 loop var that receives each set cid in turn 259bba2c361STejun Heo * @m: cmask to iterate 260bba2c361STejun Heo * 261bba2c361STejun Heo * Visits set bits within @m's active range in ascending order. Scans only the 262bba2c361STejun Heo * words the active range spans, where head and tail padding is kept zero, so 263bba2c361STejun Heo * no per-cid range check is needed. 264bba2c361STejun Heo */ 265bba2c361STejun Heo #define scx_cmask_for_each_cid(cid, m) \ 266bba2c361STejun Heo for (u64 __bs = (m)->base & ~63u, __wi = 0, \ 267bba2c361STejun Heo __nw = scx_cmask_nr_used_words(m); \ 268bba2c361STejun Heo __wi < __nw; __wi++) \ 269bba2c361STejun Heo for (u64 __w = READ_ONCE((m)->bits[__wi]); \ 270bba2c361STejun Heo __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \ 271bba2c361STejun Heo __w &= __w - 1) 272bba2c361STejun Heo 273*4437ad12STejun Heo /* 274*4437ad12STejun Heo * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form 275*4437ad12STejun Heo * schedulers it resolves to the matching cid; for cpu-form it passes @cpu 276*4437ad12STejun Heo * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op 277*4437ad12STejun Heo * (currently only ops.select_cpu); it validates the BPF-supplied cid and 278*4437ad12STejun Heo * triggers scx_error() on @sch if invalid. 279*4437ad12STejun Heo */ 280*4437ad12STejun Heo static inline s32 scx_cpu_arg(s32 cpu) 281*4437ad12STejun Heo { 282*4437ad12STejun Heo if (scx_is_cid_type()) 283*4437ad12STejun Heo return __scx_cpu_to_cid(cpu); 284*4437ad12STejun Heo return cpu; 285*4437ad12STejun Heo } 286*4437ad12STejun Heo 287*4437ad12STejun Heo static inline s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid) 288*4437ad12STejun Heo { 289*4437ad12STejun Heo if (cpu_or_cid < 0 || !scx_is_cid_type()) 290*4437ad12STejun Heo return cpu_or_cid; 291*4437ad12STejun Heo return scx_cid_to_cpu(sch, cpu_or_cid); 292*4437ad12STejun Heo } 293*4437ad12STejun Heo 294bba2c361STejun Heo #endif /* _KERNEL_SCHED_EXT_CID_H */ 295