1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * Topological CPU IDs (cids) 4 * -------------------------- 5 * 6 * Raw cpu numbers are clumsy for sharding work and communication across 7 * topology units, especially from BPF: the space can be sparse, numerical 8 * closeness doesn't imply topological closeness (x86 hyperthreading often puts 9 * SMT siblings far apart), and a range of cpu ids doesn't mean anything. 10 * Sub-scheds make this acute - cpu allocation, revocation and other state are 11 * constantly communicated across sub-scheds, and passing whole cpumasks scales 12 * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length 13 * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences 14 * for every op. 15 * 16 * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or 17 * NUMA node get contiguous cid ranges, so a topology unit becomes a (start, 18 * length) slice of cid space. Communication can pass a slice instead of a 19 * cpumask, and BPF code can process, for example, a u64 word's worth of cids at 20 * a time. 21 * 22 * The mapping is built once at root scheduler enable time by walking the 23 * topology of online cpus only. Going by online cpus is out of necessity: 24 * depending on the arch, topology info isn't reliably available for offline 25 * cpus. The expected usage model is restarting the scheduler on hotplug events 26 * so the mapping is rebuilt against the new online set. A scheduler that wants 27 * to handle hotplug without a restart can provide its own cid and shard mapping 28 * through the override interface. 29 * 30 * Copyright (c) 2026 Meta Platforms, Inc. and affiliates. 31 * Copyright (c) 2026 Tejun Heo <tj@kernel.org> 32 */ 33 #ifndef _KERNEL_SCHED_EXT_CID_H 34 #define _KERNEL_SCHED_EXT_CID_H 35 36 #include "internal.h" 37 38 struct scx_sched; 39 40 /* 41 * Cid space (total is always num_possible_cpus()) is laid out with 42 * topology-annotated cids first, then no-topo cids at the tail. The 43 * topology-annotated block covers the cpus that were online when scx_cid_init() 44 * ran and remains valid even after those cpus go offline. The tail block covers 45 * possible-but-not-online cpus and carries all-(-1) topo info (see 46 * scx_cid_topo); callers detect it via the -1 sentinels. 47 * 48 * See the comment above the table definitions in cid.c for the 49 * memory-ordering and visibility contract. 50 */ 51 extern s16 *scx_cid_to_cpu_tbl; 52 extern s16 *scx_cpu_to_cid_tbl; 53 extern struct scx_cid_topo *scx_cid_topo; 54 extern struct btf_id_set8 scx_kfunc_ids_init; 55 56 void scx_cmask_clear(struct scx_cmask *m); 57 void scx_cmask_fill(struct scx_cmask *m); 58 void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src); 59 void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src); 60 void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src); 61 void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src); 62 void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src); 63 void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src); 64 bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super); 65 bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b); 66 bool scx_cmask_empty(const struct scx_cmask *m); 67 s32 scx_cid_init(struct scx_sched *sch); 68 int scx_cid_kfunc_init(void); 69 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst); 70 71 /** 72 * cid_valid - Verify a cid value, to be used on ops input args 73 * @sch: scx_sched to abort on error 74 * @cid: cid which came from a BPF ops 75 * 76 * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger 77 * scx_error() and return false. 78 */ 79 static inline bool cid_valid(struct scx_sched *sch, s32 cid) 80 { 81 if (likely(cid >= 0 && cid < num_possible_cpus())) 82 return true; 83 scx_error(sch, "invalid cid %d", cid); 84 return false; 85 } 86 87 /** 88 * __scx_cid_to_cpu - Unchecked cid->cpu table lookup 89 * @cid: cid to look up. Must be in [0, num_possible_cpus()). 90 * 91 * Intended for callsites that have already validated @cid and that hold a 92 * non-NULL @sch from scx_prog_sched() - a live sched implies the table has 93 * been allocated, so no NULL check is needed here. 94 */ 95 static inline s32 __scx_cid_to_cpu(s32 cid) 96 { 97 /* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */ 98 return READ_ONCE(scx_cid_to_cpu_tbl)[cid]; 99 } 100 101 /** 102 * __scx_cpu_to_cid - Unchecked cpu->cid table lookup 103 * @cpu: cpu to look up. Must be a valid possible cpu id. 104 * 105 * Same usage constraints as __scx_cid_to_cpu(). 106 */ 107 static inline s32 __scx_cpu_to_cid(s32 cpu) 108 { 109 return READ_ONCE(scx_cpu_to_cid_tbl)[cpu]; 110 } 111 112 /** 113 * scx_cid_to_cpu - Translate @cid to its cpu 114 * @sch: scx_sched for error reporting 115 * @cid: cid to look up 116 * 117 * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers 118 * scx_error() on @sch. The cid arrays are allocated on first scheduler enable 119 * and never freed, so the returned cpu is stable for the lifetime of the loaded 120 * scheduler. 121 */ 122 static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid) 123 { 124 if (!cid_valid(sch, cid)) 125 return -EINVAL; 126 return __scx_cid_to_cpu(cid); 127 } 128 129 /** 130 * scx_cpu_to_cid - Translate @cpu to its cid 131 * @sch: scx_sched for error reporting 132 * @cpu: cpu to look up 133 * 134 * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers 135 * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu(). 136 */ 137 static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu) 138 { 139 if (!scx_cpu_valid(sch, cpu, NULL)) 140 return -EINVAL; 141 return __scx_cpu_to_cid(cpu); 142 } 143 144 /** 145 * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form 146 */ 147 static inline bool scx_is_cid_type(void) 148 { 149 return static_branch_unlikely(&__scx_is_cid_type); 150 } 151 152 static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m) 153 { 154 return likely(cid >= m->base && cid < m->base + m->nr_cids); 155 } 156 157 /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */ 158 static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m) 159 { 160 return (u64 *)&m->bits[cid / 64 - m->base / 64]; 161 } 162 163 /** 164 * __scx_cmask_init - Initialize @m with explicit storage capacity 165 * @m: cmask to initialize 166 * @base: first cid of the active range 167 * @nr_cids: number of cids in the active range 168 * @alloc_cids: storage capacity in cids, at least @nr_cids 169 * 170 * Use when storage is sized larger than the initial active range. All of 171 * bits[] is zeroed. 172 */ 173 static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids, 174 u32 alloc_cids) 175 { 176 if (WARN_ON_ONCE(alloc_cids < nr_cids)) 177 nr_cids = alloc_cids; 178 179 m->base = base; 180 m->nr_cids = nr_cids; 181 m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids); 182 memset(m->bits, 0, m->alloc_words * sizeof(u64)); 183 } 184 185 /** 186 * scx_cmask_init - Initialize @m on tight storage 187 * @m: cmask to initialize 188 * @base: first cid of the active range 189 * @nr_cids: number of cids in the active range 190 * 191 * All of bits[] is zeroed. 192 */ 193 static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids) 194 { 195 __scx_cmask_init(m, base, nr_cids, nr_cids); 196 } 197 198 /** 199 * scx_cmask_reframe - Reshape @m's active range without resizing storage 200 * @m: cmask to reframe 201 * @base: new active range base 202 * @nr_cids: new active range length, must fit within @m->alloc_words 203 * 204 * Body bits within the new range become garbage - only the head and tail 205 * words are zeroed to keep the padding invariant. 206 */ 207 static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids) 208 { 209 if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words)) 210 return; 211 212 if (nr_cids) { 213 u32 last_word = ((base & 63) + nr_cids - 1) / 64; 214 215 m->bits[0] = 0; 216 m->bits[last_word] = 0; 217 } 218 219 m->base = base; 220 m->nr_cids = nr_cids; 221 } 222 223 static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m) 224 { 225 if (!__scx_cmask_contains(cid, m)) 226 return; 227 *__scx_cmask_word(cid, m) |= BIT_U64(cid & 63); 228 } 229 230 /** 231 * scx_cmask_test - test whether @cid is set in @m 232 * @cid: cid to test 233 * @m: cmask to test 234 * 235 * Return %false if @cid is outside @m's active range. Otherwise return the 236 * bit's value. Read via READ_ONCE so callers can race set/clear writers. 237 */ 238 static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m) 239 { 240 if (!__scx_cmask_contains(cid, m)) 241 return false; 242 return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63); 243 } 244 245 /* 246 * Words of bits[] the active range spans, 0 if empty. Tighter than the storage 247 * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment. 248 */ 249 static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m) 250 { 251 if (!m->nr_cids) 252 return 0; 253 return ((m->base & 63) + m->nr_cids - 1) / 64 + 1; 254 } 255 256 /** 257 * scx_cmask_for_each_cid - iterate set cids in @m 258 * @cid: s32 loop var that receives each set cid in turn 259 * @m: cmask to iterate 260 * 261 * Visits set bits within @m's active range in ascending order. Scans only the 262 * words the active range spans, where head and tail padding is kept zero, so 263 * no per-cid range check is needed. 264 */ 265 #define scx_cmask_for_each_cid(cid, m) \ 266 for (u64 __bs = (m)->base & ~63u, __wi = 0, \ 267 __nw = scx_cmask_nr_used_words(m); \ 268 __wi < __nw; __wi++) \ 269 for (u64 __w = READ_ONCE((m)->bits[__wi]); \ 270 __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true); \ 271 __w &= __w - 1) 272 273 #endif /* _KERNEL_SCHED_EXT_CID_H */ 274