xref: /linux/kernel/sched/ext/cid.h (revision bba2c3615bd6cfee7456d1130f2e6b01b3f4e9ba)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Topological CPU IDs (cids)
4  * --------------------------
5  *
6  * Raw cpu numbers are clumsy for sharding work and communication across
7  * topology units, especially from BPF: the space can be sparse, numerical
8  * closeness doesn't imply topological closeness (x86 hyperthreading often puts
9  * SMT siblings far apart), and a range of cpu ids doesn't mean anything.
10  * Sub-scheds make this acute - cpu allocation, revocation and other state are
11  * constantly communicated across sub-scheds, and passing whole cpumasks scales
12  * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
13  * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
14  * for every op.
15  *
16  * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
17  * NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
18  * length) slice of cid space. Communication can pass a slice instead of a
19  * cpumask, and BPF code can process, for example, a u64 word's worth of cids at
20  * a time.
21  *
22  * The mapping is built once at root scheduler enable time by walking the
23  * topology of online cpus only. Going by online cpus is out of necessity:
24  * depending on the arch, topology info isn't reliably available for offline
25  * cpus. The expected usage model is restarting the scheduler on hotplug events
26  * so the mapping is rebuilt against the new online set. A scheduler that wants
27  * to handle hotplug without a restart can provide its own cid and shard mapping
28  * through the override interface.
29  *
30  * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
31  * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
32  */
33 #ifndef _KERNEL_SCHED_EXT_CID_H
34 #define _KERNEL_SCHED_EXT_CID_H
35 
36 struct scx_sched;	/* forward declaration; this header only uses pointers to it */
37 
38 /*
39  * Cid space (total is always num_possible_cpus()) is laid out with
40  * topology-annotated cids first, then no-topo cids at the tail. The
41  * topology-annotated block covers the cpus that were online when scx_cid_init()
42  * ran and remains valid even after those cpus go offline. The tail block covers
43  * possible-but-not-online cpus and carries all-(-1) topo info (see
44  * scx_cid_topo); callers detect it via the -1 sentinels.
45  *
46  * See the comment above the table definitions in cid.c for the
47  * memory-ordering and visibility contract.
48  */
49 extern s16 *scx_cid_to_cpu_tbl;	/* indexed by cid; read through __scx_cid_to_cpu() */
50 extern s16 *scx_cpu_to_cid_tbl;	/* indexed by cpu; read through __scx_cpu_to_cid() */
51 extern struct scx_cid_topo *scx_cid_topo;	/* NOTE(review): presumably indexed by cid - confirm in cid.c */
52 extern struct btf_id_set8 scx_kfunc_ids_init;	/* NOTE(review): not referenced in this header - confirm its user */
53 
54 void scx_cmask_clear(struct scx_cmask *m);	/* NOTE(review): struct scx_cmask is not declared in this header - presumably the includer provides it */
55 void scx_cmask_fill(struct scx_cmask *m);
56 void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
57 void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
58 void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
59 void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
60 void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
61 void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
62 bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
63 bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
64 bool scx_cmask_empty(const struct scx_cmask *m);
65 s32 scx_cid_init(struct scx_sched *sch);	/* NOTE(review): presumably builds the cid tables declared above - confirm in cid.c */
66 int scx_cid_kfunc_init(void);
67 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);	/* none of these prototypes has a definition in this header */
68 
69 /**
70  * cid_valid - Verify a cid value, to be used on ops input args
71  * @sch: scx_sched to abort on error
72  * @cid: cid which came from a BPF ops
73  *
74  * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger
75  * scx_error() and return false.
76  */
77 static inline bool cid_valid(struct scx_sched *sch, s32 cid)
78 {
79 	if (likely(cid >= 0 && cid < num_possible_cpus()))
80 		return true;
81 	scx_error(sch, "invalid cid %d", cid);
82 	return false;
83 }
84 
85 /**
86  * __scx_cid_to_cpu - Unchecked cid->cpu table lookup
87  * @cid: cid to look up. Must be in [0, num_possible_cpus()).
88  *
89  * Intended for callsites that have already validated @cid and that hold a
90  * non-NULL @sch from scx_prog_sched() - a live sched implies the table has
91  * been allocated, so no NULL check is needed here.
92  */
93 static inline s32 __scx_cid_to_cpu(s32 cid)
94 {
95 	/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
96 	return READ_ONCE(scx_cid_to_cpu_tbl)[cid];
97 }
98 
99 /**
100  * __scx_cpu_to_cid - Unchecked cpu->cid table lookup
101  * @cpu: cpu to look up. Must be a valid possible cpu id.
102  *
103  * Same usage constraints as __scx_cid_to_cpu().
104  */
105 static inline s32 __scx_cpu_to_cid(s32 cpu)
106 {
107 	return READ_ONCE(scx_cpu_to_cid_tbl)[cpu];
108 }
109 
110 /**
111  * scx_cid_to_cpu - Translate @cid to its cpu
112  * @sch: scx_sched for error reporting
113  * @cid: cid to look up
114  *
115  * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers
116  * scx_error() on @sch. The cid arrays are allocated on first scheduler enable
117  * and never freed, so the returned cpu is stable for the lifetime of the loaded
118  * scheduler.
119  */
120 static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
121 {
122 	if (!cid_valid(sch, cid))
123 		return -EINVAL;
124 	return __scx_cid_to_cpu(cid);
125 }
126 
127 /**
128  * scx_cpu_to_cid - Translate @cpu to its cid
129  * @sch: scx_sched for error reporting
130  * @cpu: cpu to look up
131  *
132  * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers
133  * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu().
134  */
135 static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
136 {
137 	if (!scx_cpu_valid(sch, cpu, NULL))
138 		return -EINVAL;
139 	return __scx_cpu_to_cid(cpu);
140 }
141 
142 /**
143  * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
144  */
145 static inline bool scx_is_cid_type(void)
146 {
147 	return static_branch_unlikely(&__scx_is_cid_type);
148 }
149 
150 static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
151 {
152 	return likely(cid >= m->base && cid < m->base + m->nr_cids);
153 }
154 
155 /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
156 static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
157 {
158 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
159 }
160 
161 /**
162  * __scx_cmask_init - Initialize @m with explicit storage capacity
163  * @m: cmask to initialize
164  * @base: first cid of the active range
165  * @nr_cids: number of cids in the active range
166  * @alloc_cids: storage capacity in cids, at least @nr_cids
167  *
168  * Use when storage is sized larger than the initial active range. All of
169  * bits[] is zeroed.
170  */
171 static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
172 				    u32 alloc_cids)
173 {
174 	if (WARN_ON_ONCE(alloc_cids < nr_cids))
175 		nr_cids = alloc_cids;
176 
177 	m->base = base;
178 	m->nr_cids = nr_cids;
179 	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
180 	memset(m->bits, 0, m->alloc_words * sizeof(u64));
181 }
182 
183 /**
184  * scx_cmask_init - Initialize @m on tight storage
185  * @m: cmask to initialize
186  * @base: first cid of the active range
187  * @nr_cids: number of cids in the active range
188  *
189  * All of bits[] is zeroed.
190  */
191 static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
192 {
193 	__scx_cmask_init(m, base, nr_cids, nr_cids);
194 }
195 
196 /**
197  * scx_cmask_reframe - Reshape @m's active range without resizing storage
198  * @m: cmask to reframe
199  * @base: new active range base
200  * @nr_cids: new active range length, must fit within @m->alloc_words
201  *
202  * Body bits within the new range become garbage - only the head and tail
203  * words are zeroed to keep the padding invariant.
204  */
205 static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
206 {
207 	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
208 		return;
209 
210 	if (nr_cids) {
211 		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
212 
213 		m->bits[0] = 0;
214 		m->bits[last_word] = 0;
215 	}
216 
217 	m->base = base;
218 	m->nr_cids = nr_cids;
219 }
220 
221 static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
222 {
223 	if (!__scx_cmask_contains(cid, m))
224 		return;
225 	*__scx_cmask_word(cid, m) |= BIT_U64(cid & 63);
226 }
227 
228 /**
229  * scx_cmask_test - test whether @cid is set in @m
230  * @cid: cid to test
231  * @m: cmask to test
232  *
233  * Return %false if @cid is outside @m's active range. Otherwise return the
234  * bit's value. Read via READ_ONCE so callers can race set/clear writers.
235  */
236 static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
237 {
238 	if (!__scx_cmask_contains(cid, m))
239 		return false;
240 	return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63);
241 }
242 
243 /*
244  * Words of bits[] the active range spans, 0 if empty. Tighter than the storage
245  * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment.
246  */
247 static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
248 {
249 	if (!m->nr_cids)
250 		return 0;
251 	return ((m->base & 63) + m->nr_cids - 1) / 64 + 1;
252 }
253 
/**
 * scx_cmask_for_each_cid - Iterate over the cids set in @m
 * @cid: s32 loop var that receives each set cid in turn
 * @m: cmask to iterate
 *
 * Visits set bits within @m's active range in ascending order. Only the words
 * the active range spans are scanned; head and tail padding is kept zero, so
 * no per-cid range check is needed. Each word is loaded once with READ_ONCE
 * and the set bits of that snapshot are then peeled off lowest-first.
 *
 * @m is evaluated more than once, so it must not have side effects. The
 * expansion is two nested for loops: a ``break`` in the body only leaves the
 * scan of the current word and iteration resumes with the next word.
 */
#define scx_cmask_for_each_cid(cid, m)						\
	for (u64 __cm_base = (m)->base & ~63u, __cm_wi = 0,			\
		     __cm_nw = scx_cmask_nr_used_words(m);			\
	     __cm_wi < __cm_nw; __cm_wi++)					\
		for (u64 __cm_word = READ_ONCE((m)->bits[__cm_wi]);		\
		     __cm_word &&						\
		     ((cid) = __cm_base + __cm_wi * 64 + __ffs64(__cm_word),	\
		      true);							\
		     __cm_word &= __cm_word - 1)
270 
271 #endif /* _KERNEL_SCHED_EXT_CID_H */
272