xref: /linux/kernel/sched/ext/cid.c (revision 7603d8e78023e5883e075b4625fbdf059c6384f7)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4  *
5  * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
6  * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
7  */
8 #include <linux/cacheinfo.h>
9 
10 #include "internal.h"
11 #include "cid.h"
12 
13 /*
14  * cid tables.
15  *
16  * Pointers are published once on first enable and never revoked. The default
17  * mapping is populated before ops.init() runs; scx_bpf_cid_override() commits
18  * before it returns. As long as the BPF scheduler only uses the tables from
19  * those points onward, it sees a consistent view.
20  */
21 s16 *scx_cid_to_cpu_tbl;
22 s16 *scx_cpu_to_cid_tbl;
23 struct scx_cid_topo *scx_cid_topo;
24 
25 #define SCX_CID_TOPO_NEG	(struct scx_cid_topo) {				\
26 	.core_cid = -1, .core_idx = -1, .llc_cid = -1, .llc_idx = -1,		\
27 	.node_cid = -1, .node_idx = -1,						\
28 }
29 
30 /*
31  * Return @cpu's LLC shared_cpu_map. If cacheinfo isn't populated (offline or
32  * !present), record @cpu in @fallbacks and return its node mask instead - the
33  * worst that can happen is that the cpu's LLC becomes coarser than reality.
34  */
35 static const struct cpumask *cpu_llc_mask(int cpu, struct cpumask *fallbacks)
36 {
37 	struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu);
38 
39 	if (!ci || !ci->info_list || !ci->num_leaves) {
40 		cpumask_set_cpu(cpu, fallbacks);
41 		return cpumask_of_node(cpu_to_node(cpu));
42 	}
43 	return &ci->info_list[ci->num_leaves - 1].shared_cpu_map;
44 }
45 
46 /* Allocate the cid tables once on first enable; never freed. */
47 static s32 scx_cid_arrays_alloc(void)
48 {
49 	u32 npossible = num_possible_cpus();
50 	s16 *cid_to_cpu, *cpu_to_cid;
51 	struct scx_cid_topo *cid_topo;
52 
53 	if (scx_cid_to_cpu_tbl)
54 		return 0;
55 
56 	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
57 	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
58 	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
59 
60 	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
61 		kfree(cid_to_cpu);
62 		kfree(cpu_to_cid);
63 		kfree(cid_topo);
64 		return -ENOMEM;
65 	}
66 
67 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
68 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
69 	WRITE_ONCE(scx_cid_topo, cid_topo);
70 	return 0;
71 }
72 
73 /**
74  * scx_cid_init - build the cid mapping
75  * @sch: the scx_sched being initialized; used as the scx_error() target
76  *
77  * See "Topological CPU IDs" in cid.h for the model. Walk online cpus by
78  * intersection at each level (parent_scratch & this_level_mask), which keeps
79  * containment correct by construction and naturally splits a physical LLC
80  * straddling two NUMA nodes into two LLC units. The caller must hold
81  * cpus_read_lock.
82  */
83 s32 scx_cid_init(struct scx_sched *sch)
84 {
85 	cpumask_var_t to_walk __free(free_cpumask_var) = CPUMASK_VAR_NULL;
86 	cpumask_var_t node_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
87 	cpumask_var_t llc_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
88 	cpumask_var_t core_scratch __free(free_cpumask_var) = CPUMASK_VAR_NULL;
89 	cpumask_var_t llc_fallback __free(free_cpumask_var) = CPUMASK_VAR_NULL;
90 	cpumask_var_t online_no_topo __free(free_cpumask_var) = CPUMASK_VAR_NULL;
91 	u32 next_cid = 0;
92 	s32 next_node_idx = 0, next_llc_idx = 0, next_core_idx = 0;
93 	s32 cpu, ret;
94 
95 	/* CMASK_MAX_WORDS in cid.bpf.h covers NR_CPUS up to 8192 */
96 	BUILD_BUG_ON(NR_CPUS > 8192);
97 
98 	lockdep_assert_cpus_held();
99 
100 	ret = scx_cid_arrays_alloc();
101 	if (ret)
102 		return ret;
103 
104 	if (!zalloc_cpumask_var(&to_walk, GFP_KERNEL) ||
105 	    !zalloc_cpumask_var(&node_scratch, GFP_KERNEL) ||
106 	    !zalloc_cpumask_var(&llc_scratch, GFP_KERNEL) ||
107 	    !zalloc_cpumask_var(&core_scratch, GFP_KERNEL) ||
108 	    !zalloc_cpumask_var(&llc_fallback, GFP_KERNEL) ||
109 	    !zalloc_cpumask_var(&online_no_topo, GFP_KERNEL))
110 		return -ENOMEM;
111 
112 	/* -1 sentinels for sparse-possible cpu id holes (0 is a valid cid) */
113 	for (cpu = 0; cpu < nr_cpu_ids; cpu++)
114 		scx_cpu_to_cid_tbl[cpu] = -1;
115 
116 	cpumask_copy(to_walk, cpu_online_mask);
117 
118 	while (!cpumask_empty(to_walk)) {
119 		s32 next_cpu = cpumask_first(to_walk);
120 		s32 nid = cpu_to_node(next_cpu);
121 		s32 node_cid = next_cid;
122 		s32 node_idx;
123 
124 		/*
125 		 * No NUMA info: skip and let the tail loop assign a no-topo
126 		 * cid. cpumask_of_node(-1) is undefined.
127 		 */
128 		if (nid < 0) {
129 			cpumask_clear_cpu(next_cpu, to_walk);
130 			continue;
131 		}
132 
133 		node_idx = next_node_idx++;
134 
135 		/* node_scratch = to_walk & this node */
136 		cpumask_and(node_scratch, to_walk, cpumask_of_node(nid));
137 		if (WARN_ON_ONCE(!cpumask_test_cpu(next_cpu, node_scratch)))
138 			return -EINVAL;
139 
140 		while (!cpumask_empty(node_scratch)) {
141 			s32 ncpu = cpumask_first(node_scratch);
142 			const struct cpumask *llc_mask = cpu_llc_mask(ncpu, llc_fallback);
143 			s32 llc_cid = next_cid;
144 			s32 llc_idx = next_llc_idx++;
145 
146 			/* llc_scratch = node_scratch & this llc */
147 			cpumask_and(llc_scratch, node_scratch, llc_mask);
148 			if (WARN_ON_ONCE(!cpumask_test_cpu(ncpu, llc_scratch)))
149 				return -EINVAL;
150 
151 			while (!cpumask_empty(llc_scratch)) {
152 				s32 lcpu = cpumask_first(llc_scratch);
153 				const struct cpumask *sib = topology_sibling_cpumask(lcpu);
154 				s32 core_cid = next_cid;
155 				s32 core_idx = next_core_idx++;
156 				s32 ccpu;
157 
158 				/* core_scratch = llc_scratch & this core */
159 				cpumask_and(core_scratch, llc_scratch, sib);
160 				if (WARN_ON_ONCE(!cpumask_test_cpu(lcpu, core_scratch)))
161 					return -EINVAL;
162 
163 				for_each_cpu(ccpu, core_scratch) {
164 					s32 cid = next_cid++;
165 
166 					scx_cid_to_cpu_tbl[cid] = ccpu;
167 					scx_cpu_to_cid_tbl[ccpu] = cid;
168 					scx_cid_topo[cid] = (struct scx_cid_topo){
169 						.core_cid = core_cid,
170 						.core_idx = core_idx,
171 						.llc_cid = llc_cid,
172 						.llc_idx = llc_idx,
173 						.node_cid = node_cid,
174 						.node_idx = node_idx,
175 					};
176 
177 					cpumask_clear_cpu(ccpu, llc_scratch);
178 					cpumask_clear_cpu(ccpu, node_scratch);
179 					cpumask_clear_cpu(ccpu, to_walk);
180 				}
181 			}
182 		}
183 	}
184 
185 	/*
186 	 * No-topo section: any possible cpu without a cid - normally just the
187 	 * not-online ones. Collect any currently-online cpus that land here in
188 	 * @online_no_topo so we can warn about them at the end.
189 	 */
190 	for_each_cpu(cpu, cpu_possible_mask) {
191 		s32 cid;
192 
193 		if (__scx_cpu_to_cid(cpu) != -1)
194 			continue;
195 		if (cpu_online(cpu))
196 			cpumask_set_cpu(cpu, online_no_topo);
197 
198 		cid = next_cid++;
199 		scx_cid_to_cpu_tbl[cid] = cpu;
200 		scx_cpu_to_cid_tbl[cpu] = cid;
201 		scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
202 	}
203 
204 	if (!cpumask_empty(llc_fallback))
205 		pr_warn("scx_cid: cpus without cacheinfo, using node mask as llc: %*pbl\n",
206 			cpumask_pr_args(llc_fallback));
207 	if (!cpumask_empty(online_no_topo))
208 		pr_warn("scx_cid: online cpus with no usable topology: %*pbl\n",
209 			cpumask_pr_args(online_no_topo));
210 
211 	return 0;
212 }
213 
214 /**
215  * scx_cmask_clear - Zero every bit in @m's active range
216  * @m: cmask to clear
217  *
218  * Storage past the active range is left as is.
219  */
220 void scx_cmask_clear(struct scx_cmask *m)
221 {
222 	u32 nr_words;
223 
224 	if (!m->nr_cids)
225 		return;
226 	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
227 	memset(m->bits, 0, nr_words * sizeof(u64));
228 }
229 
230 /**
231  * scx_cmask_fill - Set every bit in @m's active range
232  * @m: cmask to fill
233  *
234  * Counterpart to scx_cmask_clear(). Storage past the active range is left as is.
235  */
236 void scx_cmask_fill(struct scx_cmask *m)
237 {
238 	u32 nr_words, head_bits, tail_bits;
239 
240 	if (!m->nr_cids)
241 		return;
242 	nr_words = (m->base + m->nr_cids - 1) / 64 - m->base / 64 + 1;
243 	memset(m->bits, 0xff, nr_words * sizeof(u64));
244 
245 	/* clear word-0 bits below base */
246 	head_bits = m->base & 63;
247 	if (head_bits)
248 		m->bits[0] &= ~((1ULL << head_bits) - 1);
249 
250 	/* clear last-word bits at or past base + nr_cids */
251 	tail_bits = (m->base + m->nr_cids) & 63;
252 	if (tail_bits)
253 		m->bits[nr_words - 1] &= (1ULL << tail_bits) - 1;
254 }
255 
256 /**
257  * scx_cpumask_to_cmask - Translate a kernel cpumask into a cmask
258  * @src: source cpumask
259  * @dst: cmask to write
260  *
261  * Clear @dst's active range and set the bit for each cid whose cpu is in
262  * @src and lies within that range. Out-of-range cids are silently ignored.
263  */
264 void scx_cpumask_to_cmask(const struct cpumask *src, struct scx_cmask *dst)
265 {
266 	s32 cpu;
267 
268 	scx_cmask_clear(dst);
269 	for_each_cpu(cpu, src) {
270 		s32 cid = __scx_cpu_to_cid(cpu);
271 
272 		if (cid >= 0)
273 			__scx_cmask_set(cid, dst);
274 	}
275 }
276 
277 __bpf_kfunc_start_defs();
278 
279 /**
280  * scx_bpf_cid_override - Install an explicit cpu->cid mapping
281  * @cpu_to_cid: array of nr_cpu_ids s32 entries (cid for each cpu)
282  * @cpu_to_cid__sz: must be nr_cpu_ids * sizeof(s32) bytes
283  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
284  *
285  * May only be called from ops.init() of the root scheduler. Replace the
286  * topology-probed cid mapping with the caller-provided one. Each possible cpu
287  * must map to a unique cid in [0, num_possible_cpus()). Topo info is cleared.
288  * On invalid input, trigger scx_error() to abort the scheduler.
289  */
290 __bpf_kfunc void scx_bpf_cid_override(const s32 *cpu_to_cid, u32 cpu_to_cid__sz,
291 				      const struct bpf_prog_aux *aux)
292 {
293 	cpumask_var_t seen __free(free_cpumask_var) = CPUMASK_VAR_NULL;
294 	struct scx_sched *sch;
295 	bool alloced;
296 	s32 cpu, cid;
297 
298 	/* GFP_KERNEL alloc must happen before the rcu read section */
299 	alloced = zalloc_cpumask_var(&seen, GFP_KERNEL);
300 
301 	guard(rcu)();
302 
303 	sch = scx_prog_sched(aux);
304 	if (unlikely(!sch))
305 		return;
306 
307 	if (!alloced) {
308 		scx_error(sch, "scx_bpf_cid_override: failed to allocate cpumask");
309 		return;
310 	}
311 
312 	if (scx_parent(sch)) {
313 		scx_error(sch, "scx_bpf_cid_override() only allowed from root sched");
314 		return;
315 	}
316 
317 	if (cpu_to_cid__sz != nr_cpu_ids * sizeof(s32)) {
318 		scx_error(sch, "scx_bpf_cid_override: expected %zu bytes, got %u",
319 			  nr_cpu_ids * sizeof(s32), cpu_to_cid__sz);
320 		return;
321 	}
322 
323 	for_each_possible_cpu(cpu) {
324 		s32 c = cpu_to_cid[cpu];
325 
326 		if (!cid_valid(sch, c))
327 			return;
328 		if (cpumask_test_and_set_cpu(c, seen)) {
329 			scx_error(sch, "cid %d assigned to multiple cpus", c);
330 			return;
331 		}
332 		scx_cpu_to_cid_tbl[cpu] = c;
333 		scx_cid_to_cpu_tbl[c] = cpu;
334 	}
335 
336 	/* Invalidate stale topo info - the override carries no topology. */
337 	for (cid = 0; cid < num_possible_cpus(); cid++)
338 		scx_cid_topo[cid] = SCX_CID_TOPO_NEG;
339 }
340 
341 /**
342  * scx_bpf_cid_to_cpu - Return the raw CPU id for @cid
343  * @cid: cid to look up
344  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
345  *
346  * Return the raw CPU id for @cid. Trigger scx_error() and return -EINVAL if
347  * @cid is invalid. The cid<->cpu mapping is static for the lifetime of the
348  * loaded scheduler, so the BPF side can cache the result to avoid repeated
349  * kfunc invocations.
350  */
351 __bpf_kfunc s32 scx_bpf_cid_to_cpu(s32 cid, const struct bpf_prog_aux *aux)
352 {
353 	struct scx_sched *sch;
354 
355 	guard(rcu)();
356 
357 	sch = scx_prog_sched(aux);
358 	if (unlikely(!sch))
359 		return -EINVAL;
360 	return scx_cid_to_cpu(sch, cid);
361 }
362 
363 /**
364  * scx_bpf_cpu_to_cid - Return the cid for @cpu
365  * @cpu: cpu to look up
366  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
367  *
368  * Return the cid for @cpu. Trigger scx_error() and return -EINVAL if @cpu is
369  * invalid. The cid<->cpu mapping is static for the lifetime of the loaded
370  * scheduler, so the BPF side can cache the result to avoid repeated kfunc
371  * invocations.
372  */
373 __bpf_kfunc s32 scx_bpf_cpu_to_cid(s32 cpu, const struct bpf_prog_aux *aux)
374 {
375 	struct scx_sched *sch;
376 
377 	guard(rcu)();
378 
379 	sch = scx_prog_sched(aux);
380 	if (unlikely(!sch))
381 		return -EINVAL;
382 	return scx_cpu_to_cid(sch, cpu);
383 }
384 
385 /*
386  * Set ops on cmasks. cmask_walk_op2() shares one walk across mutating
387  * (and/or/copy/andnot) and predicate (subset/intersects) two-cmask forms;
388  * cmask_walk_op1() does the same shape over a single cmask range. Every public
389  * entry passes a compile-time-constant @op; cmask_walk_op{1,2}() and
390  * cmask_word_op{1,2}() are __always_inline so the inner switch collapses to the
391  * selected op and cmask_op2_is_pred() folds the predicate early-exit out of
392  * mutating ops.
393  *
394  * Two-cmask ops only touch @dst bits inside the intersection of the two ranges;
395  * bits outside stay untouched. In particular, scx_cmask_copy() does NOT zero
396  * @dst bits that lie outside @src's range.
397  *
398  * The _RACY variants are otherwise identical to their non-racy counterpart but
399  * read @src word-by-word via data_race(). Memory ordering with concurrent
400  * writers is the caller's responsibility.
401  */
402 enum cmask_op2 {
403 	/* mutating */
404 	CMASK_OP2_AND,
405 	CMASK_OP2_OR,
406 	CMASK_OP2_OR_RACY,
407 	CMASK_OP2_COPY,
408 	CMASK_OP2_COPY_RACY,
409 	CMASK_OP2_ANDNOT,
410 	/* predicates - short-circuit when the per-word result is true */
411 	CMASK_OP2_SUBSET,
412 	CMASK_OP2_INTERSECTS,
413 };
414 
415 static __always_inline bool cmask_op2_is_pred(const enum cmask_op2 op)
416 {
417 	return op == CMASK_OP2_SUBSET || op == CMASK_OP2_INTERSECTS;
418 }
419 
420 static __always_inline bool cmask_word_op2(u64 *av, const u64 *bp, u64 mask,
421 					   const enum cmask_op2 op)
422 {
423 	switch (op) {
424 	case CMASK_OP2_AND:
425 		*av &= ~mask | *bp;
426 		return false;
427 	case CMASK_OP2_OR:
428 		*av |= *bp & mask;
429 		return false;
430 	case CMASK_OP2_OR_RACY:
431 		*av |= data_race(*bp) & mask;
432 		return false;
433 	case CMASK_OP2_COPY:
434 		*av = (*av & ~mask) | (*bp & mask);
435 		return false;
436 	case CMASK_OP2_COPY_RACY:
437 		*av = (*av & ~mask) | (data_race(*bp) & mask);
438 		return false;
439 	case CMASK_OP2_ANDNOT:
440 		*av &= ~(*bp & mask);
441 		return false;
442 	case CMASK_OP2_SUBSET:
443 		/* stop on the first bit in @sub not set in @super */
444 		return (*bp & ~*av) & mask;
445 	case CMASK_OP2_INTERSECTS:
446 		return (*av & *bp) & mask;
447 	}
448 	unreachable();
449 }
450 
451 /*
452  * Walk the intersection of [@a_base, @a_base + @a_nr_cids) with [@b_base,
453  * @b_base + @b_nr_cids) word by word, applying @op. Mutating ops walk all words
454  * and return false; predicates return true on the first word whose per-word
455  * test is true. Empty intersection returns false (matches "no bits to consider"
456  * for both mutate and predicate).
457  *
458  * Base/nr_cids are taken as parameters so callers with snapshotted bounds can
459  * drive the walk with values independent of the cmask's header.
460  */
461 static __always_inline bool cmask_walk_op2(u64 *a_bits, u32 a_base, u32 a_nr_cids,
462 					   const u64 *b_bits, u32 b_base, u32 b_nr_cids,
463 					   const enum cmask_op2 op)
464 {
465 	u32 lo = max(a_base, b_base);
466 	u32 hi = min(a_base + a_nr_cids, b_base + b_nr_cids);
467 	u32 a_word_off = a_base / 64;
468 	u32 b_word_off = b_base / 64;
469 	u32 lo_word = lo / 64;
470 	u32 hi_word = (hi - 1) / 64;
471 	u64 head_mask = GENMASK_U64(63, lo & 63);
472 	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
473 	u32 w;
474 
475 	if (lo >= hi)
476 		return false;
477 
478 	if (lo_word == hi_word)
479 		return cmask_word_op2(&a_bits[lo_word - a_word_off],
480 				      &b_bits[lo_word - b_word_off],
481 				      head_mask & tail_mask, op);
482 
483 	if (cmask_word_op2(&a_bits[lo_word - a_word_off],
484 			   &b_bits[lo_word - b_word_off], head_mask, op) &&
485 	    cmask_op2_is_pred(op))
486 		return true;
487 
488 	for (w = lo_word + 1; w < hi_word; w++)
489 		if (cmask_word_op2(&a_bits[w - a_word_off],
490 				   &b_bits[w - b_word_off], ~0ULL, op) &&
491 		    cmask_op2_is_pred(op))
492 			return true;
493 
494 	return cmask_word_op2(&a_bits[hi_word - a_word_off],
495 			      &b_bits[hi_word - b_word_off], tail_mask, op);
496 }
497 
498 enum cmask_op1 {
499 	CMASK_OP1_ANY_SET,
500 };
501 
502 static __always_inline bool cmask_word_op1(const u64 *ap, u64 mask,
503 					   const enum cmask_op1 op)
504 {
505 	switch (op) {
506 	case CMASK_OP1_ANY_SET:
507 		return *ap & mask;
508 	}
509 	unreachable();
510 }
511 
512 /*
513  * Walk [@a_base, @a_base + @a_nr_cids) of @a_bits word by word, applying @op.
514  * Returns true on the first word whose per-word test is true; returns false if
515  * no word matches or the range is empty. All current op1s short-circuit on
516  * per-word true; if a non-predicate op1 lands here, add a cmask_op1_is_pred()
517  * guard analogous to cmask_op2_is_pred().
518  */
519 static __always_inline bool cmask_walk_op1(const u64 *a_bits, u32 a_base,
520 					   u32 a_nr_cids,
521 					   const enum cmask_op1 op)
522 {
523 	u32 lo = a_base;
524 	u32 hi = a_base + a_nr_cids;
525 	u32 a_word_off = a_base / 64;
526 	u32 lo_word = lo / 64;
527 	u32 hi_word = (hi - 1) / 64;
528 	u64 head_mask = GENMASK_U64(63, lo & 63);
529 	u64 tail_mask = GENMASK_U64((hi - 1) & 63, 0);
530 	u32 w;
531 
532 	if (lo >= hi)
533 		return false;
534 
535 	if (lo_word == hi_word)
536 		return cmask_word_op1(&a_bits[lo_word - a_word_off],
537 				      head_mask & tail_mask, op);
538 
539 	if (cmask_word_op1(&a_bits[lo_word - a_word_off], head_mask, op))
540 		return true;
541 	for (w = lo_word + 1; w < hi_word; w++)
542 		if (cmask_word_op1(&a_bits[w - a_word_off], ~0ULL, op))
543 			return true;
544 	return cmask_word_op1(&a_bits[hi_word - a_word_off], tail_mask, op);
545 }
546 
547 void scx_cmask_and(struct scx_cmask *dst, const struct scx_cmask *src)
548 {
549 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
550 		       src->bits, src->base, src->nr_cids, CMASK_OP2_AND);
551 }
552 
553 void scx_cmask_or(struct scx_cmask *dst, const struct scx_cmask *src)
554 {
555 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
556 		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR);
557 }
558 
559 /**
560  * scx_cmask_or_racy - OR @src into @dst, reading @src without locking
561  *
562  * @src is read word-by-word through data_race(). Same per-bit independence
563  * rationale as scx_cmask_copy_racy(). Memory ordering with writers is the
564  * caller's responsibility.
565  */
566 void scx_cmask_or_racy(struct scx_cmask *dst, const struct scx_cmask *src)
567 {
568 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
569 		       src->bits, src->base, src->nr_cids, CMASK_OP2_OR_RACY);
570 }
571 
572 void scx_cmask_copy(struct scx_cmask *dst, const struct scx_cmask *src)
573 {
574 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
575 		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY);
576 }
577 
578 /**
579  * scx_cmask_copy_racy - Snapshot @src into @dst without locking
580  *
581  * @src is read word-by-word through data_race(). Head/tail masking matches
582  * scx_cmask_copy(). Each bit in a cmask is independent, so partial updates
583  * just leave some bits fresher than others. Memory ordering with writers is
584  * the caller's responsibility.
585  */
586 void scx_cmask_copy_racy(struct scx_cmask *dst, const struct scx_cmask *src)
587 {
588 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
589 		       src->bits, src->base, src->nr_cids, CMASK_OP2_COPY_RACY);
590 }
591 
592 void scx_cmask_andnot(struct scx_cmask *dst, const struct scx_cmask *src)
593 {
594 	cmask_walk_op2(dst->bits, dst->base, dst->nr_cids,
595 		       src->bits, src->base, src->nr_cids, CMASK_OP2_ANDNOT);
596 }
597 
598 /*
599  * Return true if @cm has any bit set in [@lo, @hi). Caller must ensure
600  * [@lo, @hi) is contained in @cm's range.
601  */
602 static bool cmask_any_set_in_range(const struct scx_cmask *cm, u32 lo, u32 hi)
603 {
604 	if (lo >= hi)
605 		return false;
606 	return cmask_walk_op1(&cm->bits[lo / 64 - cm->base / 64], lo, hi - lo,
607 			      CMASK_OP1_ANY_SET);
608 }
609 
610 /**
611  * scx_cmask_subset - test whether @sub is a subset of @super
612  * @sub: cmask to test
613  * @super: cmask to test against
614  *
615  * Return true iff every set bit of @sub is also set in @super.
616  */
617 bool scx_cmask_subset(const struct scx_cmask *sub, const struct scx_cmask *super)
618 {
619 	u32 super_end = super->base + super->nr_cids;
620 	u32 sub_end = sub->base + sub->nr_cids;
621 
622 	/*
623 	 * Set bits in @sub outside @super's range can't be in @super, so any
624 	 * such bit means not a subset. The walk below only visits words
625 	 * common to both ranges, so these need a separate scan.
626 	 */
627 	if (sub->base < super->base &&
628 	    cmask_any_set_in_range(sub, sub->base, min(super->base, sub_end)))
629 		return false;
630 	if (sub_end > super_end &&
631 	    cmask_any_set_in_range(sub, max(sub->base, super_end), sub_end))
632 		return false;
633 
634 	return !cmask_walk_op2((u64 *)super->bits, super->base, super->nr_cids,
635 			       sub->bits, sub->base, sub->nr_cids, CMASK_OP2_SUBSET);
636 }
637 
638 bool scx_cmask_intersects(const struct scx_cmask *a, const struct scx_cmask *b)
639 {
640 	return cmask_walk_op2((u64 *)a->bits, a->base, a->nr_cids,
641 			      b->bits, b->base, b->nr_cids, CMASK_OP2_INTERSECTS);
642 }
643 
644 /**
645  * scx_cmask_empty - Test whether @m has no bits set
646  * @m: cmask to test
647  *
648  * Return true iff @m's active range has no bits set.
649  */
650 bool scx_cmask_empty(const struct scx_cmask *m)
651 {
652 	return !cmask_any_set_in_range(m, m->base, m->base + m->nr_cids);
653 }
654 
655 /**
656  * scx_bpf_cid_topo - Copy out per-cid topology info
657  * @cid: cid to look up
658  * @out__uninit: where to copy the topology info; fully written by this call
659  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
660  *
661  * Fill @out__uninit with the topology info for @cid. Trigger scx_error() if
662  * @cid is out of range. If @cid is valid but in the no-topo section, all fields
663  * are set to -1.
664  */
665 __bpf_kfunc void scx_bpf_cid_topo(s32 cid, struct scx_cid_topo *out__uninit,
666 				  const struct bpf_prog_aux *aux)
667 {
668 	struct scx_sched *sch;
669 
670 	guard(rcu)();
671 
672 	sch = scx_prog_sched(aux);
673 	if (unlikely(!sch) || !cid_valid(sch, cid)) {
674 		*out__uninit = SCX_CID_TOPO_NEG;
675 		return;
676 	}
677 
678 	*out__uninit = READ_ONCE(scx_cid_topo)[cid];
679 }
680 
681 __bpf_kfunc_end_defs();
682 
683 BTF_KFUNCS_START(scx_kfunc_ids_init)
684 BTF_ID_FLAGS(func, scx_bpf_cid_override, KF_IMPLICIT_ARGS | KF_SLEEPABLE)
685 BTF_KFUNCS_END(scx_kfunc_ids_init)
686 
687 static const struct btf_kfunc_id_set scx_kfunc_set_init = {
688 	.owner	= THIS_MODULE,
689 	.set	= &scx_kfunc_ids_init,
690 	.filter	= scx_kfunc_context_filter,
691 };
692 
693 BTF_KFUNCS_START(scx_kfunc_ids_cid)
694 BTF_ID_FLAGS(func, scx_bpf_cid_to_cpu, KF_IMPLICIT_ARGS)
695 BTF_ID_FLAGS(func, scx_bpf_cpu_to_cid, KF_IMPLICIT_ARGS)
696 BTF_ID_FLAGS(func, scx_bpf_cid_topo, KF_IMPLICIT_ARGS)
697 BTF_KFUNCS_END(scx_kfunc_ids_cid)
698 
699 static const struct btf_kfunc_id_set scx_kfunc_set_cid = {
700 	.owner	= THIS_MODULE,
701 	.set	= &scx_kfunc_ids_cid,
702 };
703 
704 int scx_cid_kfunc_init(void)
705 {
706 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_init) ?:
707 		register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_cid) ?:
708 		register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_cid) ?:
709 		register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_cid);
710 }
711