xref: /linux/kernel/sched/ext/cid.h (revision 7603d8e78023e5883e075b4625fbdf059c6384f7)
1bba2c361STejun Heo /* SPDX-License-Identifier: GPL-2.0 */
2bba2c361STejun Heo /*
3bba2c361STejun Heo  * Topological CPU IDs (cids)
4bba2c361STejun Heo  * --------------------------
5bba2c361STejun Heo  *
6bba2c361STejun Heo  * Raw cpu numbers are clumsy for sharding work and communication across
7bba2c361STejun Heo  * topology units, especially from BPF: the space can be sparse, numerical
8bba2c361STejun Heo  * closeness doesn't imply topological closeness (x86 hyperthreading often puts
9bba2c361STejun Heo  * SMT siblings far apart), and a range of cpu ids doesn't mean anything.
10bba2c361STejun Heo  * Sub-scheds make this acute - cpu allocation, revocation and other state are
11bba2c361STejun Heo  * constantly communicated across sub-scheds, and passing whole cpumasks scales
12bba2c361STejun Heo  * poorly with cpu count. cpumasks are also awkward in BPF: a variable-length
13bba2c361STejun Heo  * kernel type sized for the maximum NR_CPUS (4k), with verbose helper sequences
14bba2c361STejun Heo  * for every op.
15bba2c361STejun Heo  *
16bba2c361STejun Heo  * cids give every cpu a dense, topology-ordered id. CPUs sharing a core, LLC or
17bba2c361STejun Heo  * NUMA node get contiguous cid ranges, so a topology unit becomes a (start,
18bba2c361STejun Heo  * length) slice of cid space. Communication can pass a slice instead of a
19bba2c361STejun Heo  * cpumask, and BPF code can process, for example, a u64 word's worth of cids at
20bba2c361STejun Heo  * a time.
21bba2c361STejun Heo  *
22bba2c361STejun Heo  * The mapping is built once at root scheduler enable time by walking the
23bba2c361STejun Heo  * topology of online cpus only. Going by online cpus is out of necessity:
24bba2c361STejun Heo  * depending on the arch, topology info isn't reliably available for offline
25bba2c361STejun Heo  * cpus. The expected usage model is restarting the scheduler on hotplug events
26bba2c361STejun Heo  * so the mapping is rebuilt against the new online set. A scheduler that wants
27bba2c361STejun Heo  * to handle hotplug without a restart can provide its own cid and shard mapping
28bba2c361STejun Heo  * through the override interface.
29bba2c361STejun Heo  *
30bba2c361STejun Heo  * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
31bba2c361STejun Heo  * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
32bba2c361STejun Heo  */
33bba2c361STejun Heo #ifndef _KERNEL_SCHED_EXT_CID_H
34bba2c361STejun Heo #define _KERNEL_SCHED_EXT_CID_H
35bba2c361STejun Heo 
363cd1f76bSTejun Heo #include "internal.h"
373cd1f76bSTejun Heo 
38bba2c361STejun Heo struct scx_sched;
39bba2c361STejun Heo 
40bba2c361STejun Heo /*
41bba2c361STejun Heo  * Cid space (total is always num_possible_cpus()) is laid out with
42bba2c361STejun Heo  * topology-annotated cids first, then no-topo cids at the tail. The
43bba2c361STejun Heo  * topology-annotated block covers the cpus that were online when scx_cid_init()
44bba2c361STejun Heo  * ran and remains valid even after those cpus go offline. The tail block covers
45bba2c361STejun Heo  * possible-but-not-online cpus and carries all-(-1) topo info (see
46bba2c361STejun Heo  * scx_cid_topo); callers detect it via the -1 sentinels.
47bba2c361STejun Heo  *
48bba2c361STejun Heo  * See the comment above the table definitions in cid.c for the
49bba2c361STejun Heo  * memory-ordering and visibility contract.
50bba2c361STejun Heo  */
51bba2c361STejun Heo extern s16 *scx_cid_to_cpu_tbl;
52bba2c361STejun Heo extern s16 *scx_cpu_to_cid_tbl;
53bba2c361STejun Heo extern struct scx_cid_topo *scx_cid_topo;
54bba2c361STejun Heo extern struct btf_id_set8 scx_kfunc_ids_init;
55bba2c361STejun Heo 
56bba2c361STejun Heo void scx_cmask_clear(struct scx_cmask *m);
57bba2c361STejun Heo void scx_cmask_fill(struct scx_cmask *m);
58bba2c361STejun Heo void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src);
59bba2c361STejun Heo void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src);
60bba2c361STejun Heo void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src);
61bba2c361STejun Heo void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src);
62bba2c361STejun Heo void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src);
63bba2c361STejun Heo void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src);
64bba2c361STejun Heo bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super);
65bba2c361STejun Heo bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b);
66bba2c361STejun Heo bool scx_cmask_empty(const struct scx_cmask *m);
67bba2c361STejun Heo s32 scx_cid_init(struct scx_sched *sch);
68bba2c361STejun Heo int scx_cid_kfunc_init(void);
69bba2c361STejun Heo void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst);
70bba2c361STejun Heo 
71bba2c361STejun Heo /**
72bba2c361STejun Heo  * cid_valid - Verify a cid value, to be used on ops input args
73bba2c361STejun Heo  * @sch: scx_sched to abort on error
74bba2c361STejun Heo  * @cid: cid which came from a BPF ops
75bba2c361STejun Heo  *
76bba2c361STejun Heo  * Return true if @cid is in [0, num_possible_cpus()). On failure, trigger
77bba2c361STejun Heo  * scx_error() and return false.
78bba2c361STejun Heo  */
79bba2c361STejun Heo static inline bool cid_valid(struct scx_sched *sch, s32 cid)
80bba2c361STejun Heo {
81bba2c361STejun Heo 	if (likely(cid >= 0 && cid < num_possible_cpus()))
82bba2c361STejun Heo 		return true;
83bba2c361STejun Heo 	scx_error(sch, "invalid cid %d", cid);
84bba2c361STejun Heo 	return false;
85bba2c361STejun Heo }
86bba2c361STejun Heo 
87bba2c361STejun Heo /**
88bba2c361STejun Heo  * __scx_cid_to_cpu - Unchecked cid->cpu table lookup
89bba2c361STejun Heo  * @cid: cid to look up. Must be in [0, num_possible_cpus()).
90bba2c361STejun Heo  *
91bba2c361STejun Heo  * Intended for callsites that have already validated @cid and that hold a
92bba2c361STejun Heo  * non-NULL @sch from scx_prog_sched() - a live sched implies the table has
93bba2c361STejun Heo  * been allocated, so no NULL check is needed here.
94bba2c361STejun Heo  */
95bba2c361STejun Heo static inline s32 __scx_cid_to_cpu(s32 cid)
96bba2c361STejun Heo {
97bba2c361STejun Heo 	/* READ_ONCE pairs with WRITE_ONCE in scx_cid_arrays_alloc() */
98bba2c361STejun Heo 	return READ_ONCE(scx_cid_to_cpu_tbl)[cid];
99bba2c361STejun Heo }
100bba2c361STejun Heo 
101bba2c361STejun Heo /**
102bba2c361STejun Heo  * __scx_cpu_to_cid - Unchecked cpu->cid table lookup
103bba2c361STejun Heo  * @cpu: cpu to look up. Must be a valid possible cpu id.
104bba2c361STejun Heo  *
105bba2c361STejun Heo  * Same usage constraints as __scx_cid_to_cpu().
106bba2c361STejun Heo  */
107bba2c361STejun Heo static inline s32 __scx_cpu_to_cid(s32 cpu)
108bba2c361STejun Heo {
109bba2c361STejun Heo 	return READ_ONCE(scx_cpu_to_cid_tbl)[cpu];
110bba2c361STejun Heo }
111bba2c361STejun Heo 
112bba2c361STejun Heo /**
113bba2c361STejun Heo  * scx_cid_to_cpu - Translate @cid to its cpu
114bba2c361STejun Heo  * @sch: scx_sched for error reporting
115bba2c361STejun Heo  * @cid: cid to look up
116bba2c361STejun Heo  *
117bba2c361STejun Heo  * Return the cpu for @cid or a negative errno on failure. Invalid cid triggers
118bba2c361STejun Heo  * scx_error() on @sch. The cid arrays are allocated on first scheduler enable
119bba2c361STejun Heo  * and never freed, so the returned cpu is stable for the lifetime of the loaded
120bba2c361STejun Heo  * scheduler.
121bba2c361STejun Heo  */
122bba2c361STejun Heo static inline s32 scx_cid_to_cpu(struct scx_sched *sch, s32 cid)
123bba2c361STejun Heo {
124bba2c361STejun Heo 	if (!cid_valid(sch, cid))
125bba2c361STejun Heo 		return -EINVAL;
126bba2c361STejun Heo 	return __scx_cid_to_cpu(cid);
127bba2c361STejun Heo }
128bba2c361STejun Heo 
129bba2c361STejun Heo /**
130bba2c361STejun Heo  * scx_cpu_to_cid - Translate @cpu to its cid
131bba2c361STejun Heo  * @sch: scx_sched for error reporting
132bba2c361STejun Heo  * @cpu: cpu to look up
133bba2c361STejun Heo  *
134bba2c361STejun Heo  * Return the cid for @cpu or a negative errno on failure. Invalid cpu triggers
135bba2c361STejun Heo  * scx_error() on @sch. Same lifetime guarantee as scx_cid_to_cpu().
136bba2c361STejun Heo  */
137bba2c361STejun Heo static inline s32 scx_cpu_to_cid(struct scx_sched *sch, s32 cpu)
138bba2c361STejun Heo {
139bba2c361STejun Heo 	if (!scx_cpu_valid(sch, cpu, NULL))
140bba2c361STejun Heo 		return -EINVAL;
141bba2c361STejun Heo 	return __scx_cpu_to_cid(cpu);
142bba2c361STejun Heo }
143bba2c361STejun Heo 
144bba2c361STejun Heo /**
145bba2c361STejun Heo  * scx_is_cid_type - Test whether the active scheduler hierarchy is cid-form
146bba2c361STejun Heo  */
147bba2c361STejun Heo static inline bool scx_is_cid_type(void)
148bba2c361STejun Heo {
149bba2c361STejun Heo 	return static_branch_unlikely(&__scx_is_cid_type);
150bba2c361STejun Heo }
151bba2c361STejun Heo 
152bba2c361STejun Heo static inline bool __scx_cmask_contains(u32 cid, const struct scx_cmask *m)
153bba2c361STejun Heo {
154bba2c361STejun Heo 	return likely(cid >= m->base && cid < m->base + m->nr_cids);
155bba2c361STejun Heo }
156bba2c361STejun Heo 
157bba2c361STejun Heo /* Word in bits[] covering @cid. @cid must satisfy __scx_cmask_contains(). */
158bba2c361STejun Heo static inline u64 *__scx_cmask_word(u32 cid, const struct scx_cmask *m)
159bba2c361STejun Heo {
160bba2c361STejun Heo 	return (u64 *)&m->bits[cid / 64 - m->base / 64];
161bba2c361STejun Heo }
162bba2c361STejun Heo 
163bba2c361STejun Heo /**
164bba2c361STejun Heo  * __scx_cmask_init - Initialize @m with explicit storage capacity
165bba2c361STejun Heo  * @m: cmask to initialize
166bba2c361STejun Heo  * @base: first cid of the active range
167bba2c361STejun Heo  * @nr_cids: number of cids in the active range
168bba2c361STejun Heo  * @alloc_cids: storage capacity in cids, at least @nr_cids
169bba2c361STejun Heo  *
170bba2c361STejun Heo  * Use when storage is sized larger than the initial active range. All of
171bba2c361STejun Heo  * bits[] is zeroed.
172bba2c361STejun Heo  */
173bba2c361STejun Heo static inline void __scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids,
174bba2c361STejun Heo 				    u32 alloc_cids)
175bba2c361STejun Heo {
176bba2c361STejun Heo 	if (WARN_ON_ONCE(alloc_cids < nr_cids))
177bba2c361STejun Heo 		nr_cids = alloc_cids;
178bba2c361STejun Heo 
179bba2c361STejun Heo 	m->base = base;
180bba2c361STejun Heo 	m->nr_cids = nr_cids;
181bba2c361STejun Heo 	m->alloc_words = SCX_CMASK_NR_WORDS(alloc_cids);
182bba2c361STejun Heo 	memset(m->bits, 0, m->alloc_words * sizeof(u64));
183bba2c361STejun Heo }
184bba2c361STejun Heo 
185bba2c361STejun Heo /**
186bba2c361STejun Heo  * scx_cmask_init - Initialize @m on tight storage
187bba2c361STejun Heo  * @m: cmask to initialize
188bba2c361STejun Heo  * @base: first cid of the active range
189bba2c361STejun Heo  * @nr_cids: number of cids in the active range
190bba2c361STejun Heo  *
191bba2c361STejun Heo  * All of bits[] is zeroed.
192bba2c361STejun Heo  */
193bba2c361STejun Heo static inline void scx_cmask_init(struct scx_cmask *m, u32 base, u32 nr_cids)
194bba2c361STejun Heo {
195bba2c361STejun Heo 	__scx_cmask_init(m, base, nr_cids, nr_cids);
196bba2c361STejun Heo }
197bba2c361STejun Heo 
198bba2c361STejun Heo /**
199bba2c361STejun Heo  * scx_cmask_reframe - Reshape @m's active range without resizing storage
200bba2c361STejun Heo  * @m: cmask to reframe
201bba2c361STejun Heo  * @base: new active range base
202bba2c361STejun Heo  * @nr_cids: new active range length, must fit within @m->alloc_words
203bba2c361STejun Heo  *
204bba2c361STejun Heo  * Body bits within the new range become garbage - only the head and tail
205bba2c361STejun Heo  * words are zeroed to keep the padding invariant.
206bba2c361STejun Heo  */
207bba2c361STejun Heo static inline void scx_cmask_reframe(struct scx_cmask *m, u32 base, u32 nr_cids)
208bba2c361STejun Heo {
209bba2c361STejun Heo 	if (WARN_ON_ONCE(SCX_CMASK_NR_WORDS(nr_cids) > m->alloc_words))
210bba2c361STejun Heo 		return;
211bba2c361STejun Heo 
212bba2c361STejun Heo 	if (nr_cids) {
213bba2c361STejun Heo 		u32 last_word = ((base & 63) + nr_cids - 1) / 64;
214bba2c361STejun Heo 
215bba2c361STejun Heo 		m->bits[0] = 0;
216bba2c361STejun Heo 		m->bits[last_word] = 0;
217bba2c361STejun Heo 	}
218bba2c361STejun Heo 
219bba2c361STejun Heo 	m->base = base;
220bba2c361STejun Heo 	m->nr_cids = nr_cids;
221bba2c361STejun Heo }
222bba2c361STejun Heo 
223bba2c361STejun Heo static inline void __scx_cmask_set(u32 cid, struct scx_cmask *m)
224bba2c361STejun Heo {
225bba2c361STejun Heo 	if (!__scx_cmask_contains(cid, m))
226bba2c361STejun Heo 		return;
227bba2c361STejun Heo 	*__scx_cmask_word(cid, m) |= BIT_U64(cid & 63);
228bba2c361STejun Heo }
229bba2c361STejun Heo 
230bba2c361STejun Heo /**
231bba2c361STejun Heo  * scx_cmask_test - test whether @cid is set in @m
232bba2c361STejun Heo  * @cid: cid to test
233bba2c361STejun Heo  * @m: cmask to test
234bba2c361STejun Heo  *
235bba2c361STejun Heo  * Return %false if @cid is outside @m's active range. Otherwise return the
236bba2c361STejun Heo  * bit's value. Read via READ_ONCE so callers can race set/clear writers.
237bba2c361STejun Heo  */
238bba2c361STejun Heo static inline bool scx_cmask_test(u32 cid, const struct scx_cmask *m)
239bba2c361STejun Heo {
240bba2c361STejun Heo 	if (!__scx_cmask_contains(cid, m))
241bba2c361STejun Heo 		return false;
242bba2c361STejun Heo 	return READ_ONCE(*__scx_cmask_word(cid, m)) & BIT_U64(cid & 63);
243bba2c361STejun Heo }
244bba2c361STejun Heo 
245bba2c361STejun Heo /*
246bba2c361STejun Heo  * Words of bits[] the active range spans, 0 if empty. Tighter than the storage
247bba2c361STejun Heo  * SCX_CMASK_NR_WORDS() sizes for the worst-case base alignment.
248bba2c361STejun Heo  */
249bba2c361STejun Heo static inline u32 scx_cmask_nr_used_words(const struct scx_cmask *m)
250bba2c361STejun Heo {
251bba2c361STejun Heo 	if (!m->nr_cids)
252bba2c361STejun Heo 		return 0;
253bba2c361STejun Heo 	return ((m->base & 63) + m->nr_cids - 1) / 64 + 1;
254bba2c361STejun Heo }
255bba2c361STejun Heo 
/**
 * scx_cmask_for_each_cid - iterate set cids in @m
 * @cid: s32 loop var that receives each set cid in turn
 * @m: cmask to iterate
 *
 * Visits set bits within @m's active range in ascending order. Scans only the
 * words the active range spans, where head and tail padding is kept zero, so
 * no per-cid range check is needed.
 *
 * Implementation notes for users of the macro:
 *
 * - @m is expanded more than once, so it must be an expression without side
 *   effects.
 * - The base is rounded down to a multiple of 64 once, up front, together with
 *   the number of words to scan; changes to @m's range made by the loop body
 *   are not picked up.
 * - Each word is read once via READ_ONCE into a private copy which is then
 *   consumed lowest-set-bit first. Bits set or cleared in that word after the
 *   read are not observed by the iteration.
 * - The expansion is two nested for loops. A ``break`` in the loop body only
 *   leaves the inner (per-word) loop, after which the scan resumes with the
 *   next word; it does not terminate the whole iteration.
 */
#define scx_cmask_for_each_cid(cid, m)						\
	for (u64 __bs = (m)->base & ~63u, __wi = 0,				\
		     __nw = scx_cmask_nr_used_words(m);				\
	     __wi < __nw; __wi++)						\
		for (u64 __w = READ_ONCE((m)->bits[__wi]);			\
		     __w && ((cid) = __bs + __wi * 64 + __ffs64(__w), true);	\
		     __w &= __w - 1)
272bba2c361STejun Heo 
273*4437ad12STejun Heo /*
274*4437ad12STejun Heo  * scx_cpu_arg() wraps a cpu arg being handed to an SCX op. For cid-form
275*4437ad12STejun Heo  * schedulers it resolves to the matching cid; for cpu-form it passes @cpu
276*4437ad12STejun Heo  * through. scx_cpu_ret() is the inverse for a cpu/cid returned from an op
277*4437ad12STejun Heo  * (currently only ops.select_cpu); it validates the BPF-supplied cid and
278*4437ad12STejun Heo  * triggers scx_error() on @sch if invalid.
279*4437ad12STejun Heo  */
280*4437ad12STejun Heo static inline s32 scx_cpu_arg(s32 cpu)
281*4437ad12STejun Heo {
282*4437ad12STejun Heo 	if (scx_is_cid_type())
283*4437ad12STejun Heo 		return __scx_cpu_to_cid(cpu);
284*4437ad12STejun Heo 	return cpu;
285*4437ad12STejun Heo }
286*4437ad12STejun Heo 
287*4437ad12STejun Heo static inline s32 scx_cpu_ret(struct scx_sched *sch, s32 cpu_or_cid)
288*4437ad12STejun Heo {
289*4437ad12STejun Heo 	if (cpu_or_cid < 0 || !scx_is_cid_type())
290*4437ad12STejun Heo 		return cpu_or_cid;
291*4437ad12STejun Heo 	return scx_cid_to_cpu(sch, cpu_or_cid);
292*4437ad12STejun Heo }
293*4437ad12STejun Heo 
294bba2c361STejun Heo #endif /* _KERNEL_SCHED_EXT_CID_H */
295