xref: /linux/kernel/sched/ext/idle.c (revision 7603d8e78023e5883e075b4625fbdf059c6384f7)
1bba2c361STejun Heo // SPDX-License-Identifier: GPL-2.0
2bba2c361STejun Heo /*
3bba2c361STejun Heo  * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
4bba2c361STejun Heo  *
5bba2c361STejun Heo  * Built-in idle CPU tracking policy.
6bba2c361STejun Heo  *
7bba2c361STejun Heo  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
8bba2c361STejun Heo  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
9bba2c361STejun Heo  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
10bba2c361STejun Heo  * Copyright (c) 2024 Andrea Righi <arighi@nvidia.com>
11bba2c361STejun Heo  */
12*3cd1f76bSTejun Heo #include "internal.h"
13*3cd1f76bSTejun Heo #include "cid.h"
14*3cd1f76bSTejun Heo #include "idle.h"
15bba2c361STejun Heo 
/* Static key gating the built-in idle CPU selection policy (off by default). */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);

/* Static key gating the use of per-node idle cpumasks (off by default). */
static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_per_node);

/* Static key gating the LLC-aware selection optimizations (off by default). */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc);

/* Static key gating the NUMA-aware selection optimizations (off by default). */
static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa);
27bba2c361STejun Heo 
28bba2c361STejun Heo /*
29bba2c361STejun Heo  * cpumasks to track idle CPUs within each NUMA node.
30bba2c361STejun Heo  *
31bba2c361STejun Heo  * If SCX_OPS_BUILTIN_IDLE_PER_NODE is not enabled, a single global cpumask
32bba2c361STejun Heo  * from is used to track all the idle CPUs in the system.
33bba2c361STejun Heo  */
34bba2c361STejun Heo struct scx_idle_cpus {
35bba2c361STejun Heo 	cpumask_var_t cpu;
36bba2c361STejun Heo 	cpumask_var_t smt;
37bba2c361STejun Heo };
38bba2c361STejun Heo 
39bba2c361STejun Heo /*
40bba2c361STejun Heo  * Global host-wide idle cpumasks (used when SCX_OPS_BUILTIN_IDLE_PER_NODE
41bba2c361STejun Heo  * is not enabled).
42bba2c361STejun Heo  */
43bba2c361STejun Heo static struct scx_idle_cpus scx_idle_global_masks;
44bba2c361STejun Heo 
45bba2c361STejun Heo /*
46bba2c361STejun Heo  * Per-node idle cpumasks.
47bba2c361STejun Heo  */
48bba2c361STejun Heo static struct scx_idle_cpus **scx_idle_node_masks;
49bba2c361STejun Heo 
50bba2c361STejun Heo /*
51bba2c361STejun Heo  * Local per-CPU cpumasks (used to generate temporary idle cpumasks).
52bba2c361STejun Heo  */
53bba2c361STejun Heo static DEFINE_PER_CPU(cpumask_var_t, local_idle_cpumask);
54bba2c361STejun Heo static DEFINE_PER_CPU(cpumask_var_t, local_llc_idle_cpumask);
55bba2c361STejun Heo static DEFINE_PER_CPU(cpumask_var_t, local_numa_idle_cpumask);
56bba2c361STejun Heo 
57bba2c361STejun Heo /*
58bba2c361STejun Heo  * Return the idle masks associated to a target @node.
59bba2c361STejun Heo  *
60bba2c361STejun Heo  * NUMA_NO_NODE identifies the global idle cpumask.
61bba2c361STejun Heo  */
62bba2c361STejun Heo static struct scx_idle_cpus *idle_cpumask(int node)
63bba2c361STejun Heo {
64bba2c361STejun Heo 	return node == NUMA_NO_NODE ? &scx_idle_global_masks : scx_idle_node_masks[node];
65bba2c361STejun Heo }
66bba2c361STejun Heo 
67bba2c361STejun Heo /*
68bba2c361STejun Heo  * Returns the NUMA node ID associated with a @cpu, or NUMA_NO_NODE if
69bba2c361STejun Heo  * per-node idle cpumasks are disabled.
70bba2c361STejun Heo  */
71bba2c361STejun Heo static int scx_cpu_node_if_enabled(int cpu)
72bba2c361STejun Heo {
73bba2c361STejun Heo 	if (!static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node))
74bba2c361STejun Heo 		return NUMA_NO_NODE;
75bba2c361STejun Heo 
76bba2c361STejun Heo 	return cpu_to_node(cpu);
77bba2c361STejun Heo }
78bba2c361STejun Heo 
79bba2c361STejun Heo static bool scx_idle_test_and_clear_cpu(int cpu)
80bba2c361STejun Heo {
81bba2c361STejun Heo 	int node = scx_cpu_node_if_enabled(cpu);
82bba2c361STejun Heo 	struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
83bba2c361STejun Heo 
84bba2c361STejun Heo 	/*
85bba2c361STejun Heo 	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
86bba2c361STejun Heo 	 * cluster is not wholly idle either way. This also prevents
87bba2c361STejun Heo 	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
88bba2c361STejun Heo 	 */
89bba2c361STejun Heo 	if (sched_smt_active()) {
90bba2c361STejun Heo 		const struct cpumask *smt = cpu_smt_mask(cpu);
91bba2c361STejun Heo 		struct cpumask *idle_smts = idle_cpumask(node)->smt;
92bba2c361STejun Heo 
93bba2c361STejun Heo 		/*
94bba2c361STejun Heo 		 * If offline, @cpu is not its own sibling and
95bba2c361STejun Heo 		 * scx_pick_idle_cpu() can get caught in an infinite loop as
96bba2c361STejun Heo 		 * @cpu is never cleared from the idle SMT mask. Ensure that
97bba2c361STejun Heo 		 * @cpu is eventually cleared.
98bba2c361STejun Heo 		 *
99bba2c361STejun Heo 		 * NOTE: Use cpumask_intersects() and cpumask_test_cpu() to
100bba2c361STejun Heo 		 * reduce memory writes, which may help alleviate cache
101bba2c361STejun Heo 		 * coherence pressure.
102bba2c361STejun Heo 		 */
103bba2c361STejun Heo 		if (cpumask_intersects(smt, idle_smts))
104bba2c361STejun Heo 			cpumask_andnot(idle_smts, idle_smts, smt);
105bba2c361STejun Heo 		else if (cpumask_test_cpu(cpu, idle_smts))
106bba2c361STejun Heo 			__cpumask_clear_cpu(cpu, idle_smts);
107bba2c361STejun Heo 	}
108bba2c361STejun Heo 
109bba2c361STejun Heo 	return cpumask_test_and_clear_cpu(cpu, idle_cpus);
110bba2c361STejun Heo }
111bba2c361STejun Heo 
112bba2c361STejun Heo /*
113bba2c361STejun Heo  * Pick an idle CPU in a specific NUMA node.
114bba2c361STejun Heo  */
115bba2c361STejun Heo static s32 pick_idle_cpu_in_node(const struct cpumask *cpus_allowed, int node, u64 flags)
116bba2c361STejun Heo {
117bba2c361STejun Heo 	int cpu;
118bba2c361STejun Heo 
119bba2c361STejun Heo retry:
120bba2c361STejun Heo 	if (sched_smt_active()) {
121bba2c361STejun Heo 		cpu = cpumask_any_and_distribute(idle_cpumask(node)->smt, cpus_allowed);
122bba2c361STejun Heo 		if (cpu < nr_cpu_ids)
123bba2c361STejun Heo 			goto found;
124bba2c361STejun Heo 
125bba2c361STejun Heo 		if (flags & SCX_PICK_IDLE_CORE)
126bba2c361STejun Heo 			return -EBUSY;
127bba2c361STejun Heo 	}
128bba2c361STejun Heo 
129bba2c361STejun Heo 	cpu = cpumask_any_and_distribute(idle_cpumask(node)->cpu, cpus_allowed);
130bba2c361STejun Heo 	if (cpu >= nr_cpu_ids)
131bba2c361STejun Heo 		return -EBUSY;
132bba2c361STejun Heo 
133bba2c361STejun Heo found:
134bba2c361STejun Heo 	if (scx_idle_test_and_clear_cpu(cpu))
135bba2c361STejun Heo 		return cpu;
136bba2c361STejun Heo 	else
137bba2c361STejun Heo 		goto retry;
138bba2c361STejun Heo }
139bba2c361STejun Heo 
140bba2c361STejun Heo #ifdef CONFIG_NUMA
141bba2c361STejun Heo /*
142bba2c361STejun Heo  * Tracks nodes that have not yet been visited when searching for an idle
143bba2c361STejun Heo  * CPU across all available nodes.
144bba2c361STejun Heo  */
145bba2c361STejun Heo static DEFINE_PER_CPU(nodemask_t, per_cpu_unvisited);
146bba2c361STejun Heo 
147bba2c361STejun Heo /*
148bba2c361STejun Heo  * Search for an idle CPU across all nodes, excluding @node.
149bba2c361STejun Heo  */
150bba2c361STejun Heo static s32 pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
151bba2c361STejun Heo {
152bba2c361STejun Heo 	nodemask_t *unvisited;
153bba2c361STejun Heo 	s32 cpu = -EBUSY;
154bba2c361STejun Heo 
155bba2c361STejun Heo 	preempt_disable();
156bba2c361STejun Heo 	unvisited = this_cpu_ptr(&per_cpu_unvisited);
157bba2c361STejun Heo 
158bba2c361STejun Heo 	/*
159bba2c361STejun Heo 	 * Restrict the search to the online nodes (excluding the current
160bba2c361STejun Heo 	 * node that has been visited already).
161bba2c361STejun Heo 	 */
162bba2c361STejun Heo 	nodes_copy(*unvisited, node_states[N_ONLINE]);
163bba2c361STejun Heo 	node_clear(node, *unvisited);
164bba2c361STejun Heo 
165bba2c361STejun Heo 	/*
166bba2c361STejun Heo 	 * Traverse all nodes in order of increasing distance, starting
167bba2c361STejun Heo 	 * from @node.
168bba2c361STejun Heo 	 *
169bba2c361STejun Heo 	 * This loop is O(N^2), with N being the amount of NUMA nodes,
170bba2c361STejun Heo 	 * which might be quite expensive in large NUMA systems. However,
171bba2c361STejun Heo 	 * this complexity comes into play only when a scheduler enables
172bba2c361STejun Heo 	 * SCX_OPS_BUILTIN_IDLE_PER_NODE and it's requesting an idle CPU
173bba2c361STejun Heo 	 * without specifying a target NUMA node, so it shouldn't be a
174bba2c361STejun Heo 	 * bottleneck is most cases.
175bba2c361STejun Heo 	 *
176bba2c361STejun Heo 	 * As a future optimization we may want to cache the list of nodes
177bba2c361STejun Heo 	 * in a per-node array, instead of actually traversing them every
178bba2c361STejun Heo 	 * time.
179bba2c361STejun Heo 	 */
180bba2c361STejun Heo 	for_each_node_numadist(node, *unvisited) {
181bba2c361STejun Heo 		cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
182bba2c361STejun Heo 		if (cpu >= 0)
183bba2c361STejun Heo 			break;
184bba2c361STejun Heo 	}
185bba2c361STejun Heo 	preempt_enable();
186bba2c361STejun Heo 
187bba2c361STejun Heo 	return cpu;
188bba2c361STejun Heo }
189bba2c361STejun Heo #else
190bba2c361STejun Heo static inline s32
191bba2c361STejun Heo pick_idle_cpu_from_online_nodes(const struct cpumask *cpus_allowed, int node, u64 flags)
192bba2c361STejun Heo {
193bba2c361STejun Heo 	return -EBUSY;
194bba2c361STejun Heo }
195bba2c361STejun Heo #endif
196bba2c361STejun Heo 
197bba2c361STejun Heo /*
198bba2c361STejun Heo  * Find an idle CPU in the system, starting from @node.
199bba2c361STejun Heo  */
200bba2c361STejun Heo static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, int node, u64 flags)
201bba2c361STejun Heo {
202bba2c361STejun Heo 	s32 cpu;
203bba2c361STejun Heo 
204bba2c361STejun Heo 	/*
205bba2c361STejun Heo 	 * Always search in the starting node first (this is an
206bba2c361STejun Heo 	 * optimization that can save some cycles even when the search is
207bba2c361STejun Heo 	 * not limited to a single node).
208bba2c361STejun Heo 	 */
209bba2c361STejun Heo 	cpu = pick_idle_cpu_in_node(cpus_allowed, node, flags);
210bba2c361STejun Heo 	if (cpu >= 0)
211bba2c361STejun Heo 		return cpu;
212bba2c361STejun Heo 
213bba2c361STejun Heo 	/*
214bba2c361STejun Heo 	 * Stop the search if we are using only a single global cpumask
215bba2c361STejun Heo 	 * (NUMA_NO_NODE) or if the search is restricted to the first node
216bba2c361STejun Heo 	 * only.
217bba2c361STejun Heo 	 */
218bba2c361STejun Heo 	if (node == NUMA_NO_NODE || flags & SCX_PICK_IDLE_IN_NODE)
219bba2c361STejun Heo 		return -EBUSY;
220bba2c361STejun Heo 
221bba2c361STejun Heo 	/*
222bba2c361STejun Heo 	 * Extend the search to the other online nodes.
223bba2c361STejun Heo 	 */
224bba2c361STejun Heo 	return pick_idle_cpu_from_online_nodes(cpus_allowed, node, flags);
225bba2c361STejun Heo }
226bba2c361STejun Heo 
227bba2c361STejun Heo /*
228bba2c361STejun Heo  * Return the amount of CPUs in the same LLC domain of @cpu (or zero if the LLC
229bba2c361STejun Heo  * domain is not defined).
230bba2c361STejun Heo  */
231bba2c361STejun Heo static unsigned int llc_weight(s32 cpu)
232bba2c361STejun Heo {
233bba2c361STejun Heo 	struct sched_domain *sd;
234bba2c361STejun Heo 
235bba2c361STejun Heo 	sd = rcu_dereference(per_cpu(sd_llc, cpu));
236bba2c361STejun Heo 	if (!sd)
237bba2c361STejun Heo 		return 0;
238bba2c361STejun Heo 
239bba2c361STejun Heo 	return sd->span_weight;
240bba2c361STejun Heo }
241bba2c361STejun Heo 
242bba2c361STejun Heo /*
243bba2c361STejun Heo  * Return the cpumask representing the LLC domain of @cpu (or NULL if the LLC
244bba2c361STejun Heo  * domain is not defined).
245bba2c361STejun Heo  */
246bba2c361STejun Heo static struct cpumask *llc_span(s32 cpu)
247bba2c361STejun Heo {
248bba2c361STejun Heo 	struct sched_domain *sd;
249bba2c361STejun Heo 
250bba2c361STejun Heo 	sd = rcu_dereference(per_cpu(sd_llc, cpu));
251bba2c361STejun Heo 	if (!sd)
252bba2c361STejun Heo 		return NULL;
253bba2c361STejun Heo 
254bba2c361STejun Heo 	return sched_domain_span(sd);
255bba2c361STejun Heo }
256bba2c361STejun Heo 
257bba2c361STejun Heo /*
258bba2c361STejun Heo  * Return the amount of CPUs in the same NUMA domain of @cpu (or zero if the
259bba2c361STejun Heo  * NUMA domain is not defined).
260bba2c361STejun Heo  */
261bba2c361STejun Heo static unsigned int numa_weight(s32 cpu)
262bba2c361STejun Heo {
263bba2c361STejun Heo 	struct sched_domain *sd;
264bba2c361STejun Heo 	struct sched_group *sg;
265bba2c361STejun Heo 
266bba2c361STejun Heo 	sd = rcu_dereference(per_cpu(sd_numa, cpu));
267bba2c361STejun Heo 	if (!sd)
268bba2c361STejun Heo 		return 0;
269bba2c361STejun Heo 	sg = sd->groups;
270bba2c361STejun Heo 	if (!sg)
271bba2c361STejun Heo 		return 0;
272bba2c361STejun Heo 
273bba2c361STejun Heo 	return sg->group_weight;
274bba2c361STejun Heo }
275bba2c361STejun Heo 
276bba2c361STejun Heo /*
277bba2c361STejun Heo  * Return the cpumask representing the NUMA domain of @cpu (or NULL if the NUMA
278bba2c361STejun Heo  * domain is not defined).
279bba2c361STejun Heo  */
280bba2c361STejun Heo static struct cpumask *numa_span(s32 cpu)
281bba2c361STejun Heo {
282bba2c361STejun Heo 	struct sched_domain *sd;
283bba2c361STejun Heo 	struct sched_group *sg;
284bba2c361STejun Heo 
285bba2c361STejun Heo 	sd = rcu_dereference(per_cpu(sd_numa, cpu));
286bba2c361STejun Heo 	if (!sd)
287bba2c361STejun Heo 		return NULL;
288bba2c361STejun Heo 	sg = sd->groups;
289bba2c361STejun Heo 	if (!sg)
290bba2c361STejun Heo 		return NULL;
291bba2c361STejun Heo 
292bba2c361STejun Heo 	return sched_group_span(sg);
293bba2c361STejun Heo }
294bba2c361STejun Heo 
295bba2c361STejun Heo /*
296bba2c361STejun Heo  * Return true if the LLC domains do not perfectly overlap with the NUMA
297bba2c361STejun Heo  * domains, false otherwise.
298bba2c361STejun Heo  */
299bba2c361STejun Heo static bool llc_numa_mismatch(void)
300bba2c361STejun Heo {
301bba2c361STejun Heo 	int cpu;
302bba2c361STejun Heo 
303bba2c361STejun Heo 	/*
304bba2c361STejun Heo 	 * We need to scan all online CPUs to verify whether their scheduling
305bba2c361STejun Heo 	 * domains overlap.
306bba2c361STejun Heo 	 *
307bba2c361STejun Heo 	 * While it is rare to encounter architectures with asymmetric NUMA
308bba2c361STejun Heo 	 * topologies, CPU hotplugging or virtualized environments can result
309bba2c361STejun Heo 	 * in asymmetric configurations.
310bba2c361STejun Heo 	 *
311bba2c361STejun Heo 	 * For example:
312bba2c361STejun Heo 	 *
313bba2c361STejun Heo 	 *  NUMA 0:
314bba2c361STejun Heo 	 *    - LLC 0: cpu0..cpu7
315bba2c361STejun Heo 	 *    - LLC 1: cpu8..cpu15 [offline]
316bba2c361STejun Heo 	 *
317bba2c361STejun Heo 	 *  NUMA 1:
318bba2c361STejun Heo 	 *    - LLC 0: cpu16..cpu23
319bba2c361STejun Heo 	 *    - LLC 1: cpu24..cpu31
320bba2c361STejun Heo 	 *
321bba2c361STejun Heo 	 * In this case, if we only check the first online CPU (cpu0), we might
322bba2c361STejun Heo 	 * incorrectly assume that the LLC and NUMA domains are fully
323bba2c361STejun Heo 	 * overlapping, which is incorrect (as NUMA 1 has two distinct LLC
324bba2c361STejun Heo 	 * domains).
325bba2c361STejun Heo 	 */
326bba2c361STejun Heo 	for_each_online_cpu(cpu)
327bba2c361STejun Heo 		if (llc_weight(cpu) != numa_weight(cpu))
328bba2c361STejun Heo 			return true;
329bba2c361STejun Heo 
330bba2c361STejun Heo 	return false;
331bba2c361STejun Heo }
332bba2c361STejun Heo 
333bba2c361STejun Heo /*
334bba2c361STejun Heo  * Initialize topology-aware scheduling.
335bba2c361STejun Heo  *
336bba2c361STejun Heo  * Detect if the system has multiple LLC or multiple NUMA domains and enable
337bba2c361STejun Heo  * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle
338bba2c361STejun Heo  * selection policy.
339bba2c361STejun Heo  *
340bba2c361STejun Heo  * Assumption: the kernel's internal topology representation assumes that each
341bba2c361STejun Heo  * CPU belongs to a single LLC domain, and that each LLC domain is entirely
342bba2c361STejun Heo  * contained within a single NUMA node.
343bba2c361STejun Heo  */
344bba2c361STejun Heo void scx_idle_update_selcpu_topology(struct sched_ext_ops *ops)
345bba2c361STejun Heo {
346bba2c361STejun Heo 	bool enable_llc = false, enable_numa = false;
347bba2c361STejun Heo 	unsigned int nr_cpus;
348bba2c361STejun Heo 	s32 cpu = cpumask_first(cpu_online_mask);
349bba2c361STejun Heo 
350bba2c361STejun Heo 	/*
351bba2c361STejun Heo 	 * Enable LLC domain optimization only when there are multiple LLC
352bba2c361STejun Heo 	 * domains among the online CPUs. If all online CPUs are part of a
353bba2c361STejun Heo 	 * single LLC domain, the idle CPU selection logic can choose any
354bba2c361STejun Heo 	 * online CPU without bias.
355bba2c361STejun Heo 	 *
356bba2c361STejun Heo 	 * Note that it is sufficient to check the LLC domain of the first
357bba2c361STejun Heo 	 * online CPU to determine whether a single LLC domain includes all
358bba2c361STejun Heo 	 * CPUs.
359bba2c361STejun Heo 	 */
360bba2c361STejun Heo 	rcu_read_lock();
361bba2c361STejun Heo 	nr_cpus = llc_weight(cpu);
362bba2c361STejun Heo 	if (nr_cpus > 0) {
363bba2c361STejun Heo 		if (nr_cpus < num_online_cpus())
364bba2c361STejun Heo 			enable_llc = true;
365bba2c361STejun Heo 		pr_debug("sched_ext: LLC=%*pb weight=%u\n",
366bba2c361STejun Heo 			 cpumask_pr_args(llc_span(cpu)), llc_weight(cpu));
367bba2c361STejun Heo 	}
368bba2c361STejun Heo 
369bba2c361STejun Heo 	/*
370bba2c361STejun Heo 	 * Enable NUMA optimization only when there are multiple NUMA domains
371bba2c361STejun Heo 	 * among the online CPUs and the NUMA domains don't perfectly overlap
372bba2c361STejun Heo 	 * with the LLC domains.
373bba2c361STejun Heo 	 *
374bba2c361STejun Heo 	 * If all CPUs belong to the same NUMA node and the same LLC domain,
375bba2c361STejun Heo 	 * enabling both NUMA and LLC optimizations is unnecessary, as checking
376bba2c361STejun Heo 	 * for an idle CPU in the same domain twice is redundant.
377bba2c361STejun Heo 	 *
378bba2c361STejun Heo 	 * If SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled ignore the NUMA
379bba2c361STejun Heo 	 * optimization, as we would naturally select idle CPUs within
380bba2c361STejun Heo 	 * specific NUMA nodes querying the corresponding per-node cpumask.
381bba2c361STejun Heo 	 */
382bba2c361STejun Heo 	if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
383bba2c361STejun Heo 		nr_cpus = numa_weight(cpu);
384bba2c361STejun Heo 		if (nr_cpus > 0) {
385bba2c361STejun Heo 			if (nr_cpus < num_online_cpus() && llc_numa_mismatch())
386bba2c361STejun Heo 				enable_numa = true;
387bba2c361STejun Heo 			pr_debug("sched_ext: NUMA=%*pb weight=%u\n",
388bba2c361STejun Heo 				 cpumask_pr_args(numa_span(cpu)), nr_cpus);
389bba2c361STejun Heo 		}
390bba2c361STejun Heo 	}
391bba2c361STejun Heo 	rcu_read_unlock();
392bba2c361STejun Heo 
393bba2c361STejun Heo 	pr_debug("sched_ext: LLC idle selection %s\n",
394bba2c361STejun Heo 		 str_enabled_disabled(enable_llc));
395bba2c361STejun Heo 	pr_debug("sched_ext: NUMA idle selection %s\n",
396bba2c361STejun Heo 		 str_enabled_disabled(enable_numa));
397bba2c361STejun Heo 
398bba2c361STejun Heo 	if (enable_llc)
399bba2c361STejun Heo 		static_branch_enable_cpuslocked(&scx_selcpu_topo_llc);
400bba2c361STejun Heo 	else
401bba2c361STejun Heo 		static_branch_disable_cpuslocked(&scx_selcpu_topo_llc);
402bba2c361STejun Heo 	if (enable_numa)
403bba2c361STejun Heo 		static_branch_enable_cpuslocked(&scx_selcpu_topo_numa);
404bba2c361STejun Heo 	else
405bba2c361STejun Heo 		static_branch_disable_cpuslocked(&scx_selcpu_topo_numa);
406bba2c361STejun Heo }
407bba2c361STejun Heo 
408bba2c361STejun Heo /*
409bba2c361STejun Heo  * Return true if @p can run on all possible CPUs, false otherwise.
410bba2c361STejun Heo  */
411bba2c361STejun Heo static inline bool task_affinity_all(const struct task_struct *p)
412bba2c361STejun Heo {
413bba2c361STejun Heo 	return p->nr_cpus_allowed >= num_possible_cpus();
414bba2c361STejun Heo }
415bba2c361STejun Heo 
416bba2c361STejun Heo /*
417bba2c361STejun Heo  * Built-in CPU idle selection policy:
418bba2c361STejun Heo  *
419bba2c361STejun Heo  * 1. Prioritize full-idle cores:
420bba2c361STejun Heo  *   - always prioritize CPUs from fully idle cores (both logical CPUs are
421bba2c361STejun Heo  *     idle) to avoid interference caused by SMT.
422bba2c361STejun Heo  *
423bba2c361STejun Heo  * 2. Reuse the same CPU:
424bba2c361STejun Heo  *   - prefer the last used CPU to take advantage of cached data (L1, L2) and
425bba2c361STejun Heo  *     branch prediction optimizations.
426bba2c361STejun Heo  *
427bba2c361STejun Heo  * 3. Prefer @prev_cpu's SMT sibling:
428bba2c361STejun Heo  *   - if @prev_cpu is busy and no fully idle core is available, try to
429bba2c361STejun Heo  *     place the task on an idle SMT sibling of @prev_cpu; keeping the
430bba2c361STejun Heo  *     task on the same core makes migration cheaper, preserves L1 cache
431bba2c361STejun Heo  *     locality and reduces wakeup latency.
432bba2c361STejun Heo  *
433bba2c361STejun Heo  * 4. Pick a CPU within the same LLC (Last-Level Cache):
434bba2c361STejun Heo  *   - if the above conditions aren't met, pick a CPU that shares the same
435bba2c361STejun Heo  *     LLC, if the LLC domain is a subset of @cpus_allowed, to maintain
436bba2c361STejun Heo  *     cache locality.
437bba2c361STejun Heo  *
438bba2c361STejun Heo  * 5. Pick a CPU within the same NUMA node, if enabled:
439bba2c361STejun Heo  *   - choose a CPU from the same NUMA node, if the node cpumask is a
440bba2c361STejun Heo  *     subset of @cpus_allowed, to reduce memory access latency.
441bba2c361STejun Heo  *
442bba2c361STejun Heo  * 6. Pick any idle CPU within the @cpus_allowed domain.
443bba2c361STejun Heo  *
444bba2c361STejun Heo  * Step 4 and 5 are performed only if the system has, respectively,
445bba2c361STejun Heo  * multiple LLCs / multiple NUMA nodes (see scx_selcpu_topo_llc and
446bba2c361STejun Heo  * scx_selcpu_topo_numa) and they don't contain the same subset of CPUs.
447bba2c361STejun Heo  *
448bba2c361STejun Heo  * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, the search will always
449bba2c361STejun Heo  * begin in @prev_cpu's node and proceed to other nodes in order of
450bba2c361STejun Heo  * increasing distance.
451bba2c361STejun Heo  *
452bba2c361STejun Heo  * Return the picked CPU if idle, or a negative value otherwise.
453bba2c361STejun Heo  *
454bba2c361STejun Heo  * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because
455bba2c361STejun Heo  * we never call ops.select_cpu() for them, see select_task_rq().
456bba2c361STejun Heo  */
457bba2c361STejun Heo s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
458bba2c361STejun Heo 		       const struct cpumask *cpus_allowed, u64 flags)
459bba2c361STejun Heo {
460bba2c361STejun Heo 	const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
461bba2c361STejun Heo 	const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
462bba2c361STejun Heo 	int node = scx_cpu_node_if_enabled(prev_cpu);
463bba2c361STejun Heo 	bool is_prev_allowed;
464bba2c361STejun Heo 	s32 cpu;
465bba2c361STejun Heo 
466bba2c361STejun Heo 	preempt_disable();
467bba2c361STejun Heo 
468bba2c361STejun Heo 	/*
469bba2c361STejun Heo 	 * Determine the subset of CPUs usable by @p within @cpus_allowed.
470bba2c361STejun Heo 	 */
471bba2c361STejun Heo 	if (allowed != p->cpus_ptr) {
472bba2c361STejun Heo 		struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_idle_cpumask);
473bba2c361STejun Heo 
474bba2c361STejun Heo 		if (task_affinity_all(p)) {
475bba2c361STejun Heo 			allowed = cpus_allowed;
476bba2c361STejun Heo 		} else if (cpumask_and(local_cpus, cpus_allowed, p->cpus_ptr)) {
477bba2c361STejun Heo 			allowed = local_cpus;
478bba2c361STejun Heo 		} else {
479bba2c361STejun Heo 			cpu = -EBUSY;
480bba2c361STejun Heo 			goto out_enable;
481bba2c361STejun Heo 		}
482bba2c361STejun Heo 	}
483bba2c361STejun Heo 
484bba2c361STejun Heo 	/*
485bba2c361STejun Heo 	 * Check whether @prev_cpu is still within the allowed set. If not,
486bba2c361STejun Heo 	 * we can still try selecting a nearby CPU.
487bba2c361STejun Heo 	 */
488bba2c361STejun Heo 	is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
489bba2c361STejun Heo 
490bba2c361STejun Heo 	/*
491bba2c361STejun Heo 	 * This is necessary to protect llc_cpus.
492bba2c361STejun Heo 	 */
493bba2c361STejun Heo 	rcu_read_lock();
494bba2c361STejun Heo 
495bba2c361STejun Heo 	/*
496bba2c361STejun Heo 	 * Determine the subset of CPUs that the task can use in its
497bba2c361STejun Heo 	 * current LLC and node.
498bba2c361STejun Heo 	 *
499bba2c361STejun Heo 	 * If the task can run on all CPUs, use the node and LLC cpumasks
500bba2c361STejun Heo 	 * directly.
501bba2c361STejun Heo 	 */
502bba2c361STejun Heo 	if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) {
503bba2c361STejun Heo 		struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_numa_idle_cpumask);
504bba2c361STejun Heo 		const struct cpumask *cpus = numa_span(prev_cpu);
505bba2c361STejun Heo 
506bba2c361STejun Heo 		if (allowed == p->cpus_ptr && task_affinity_all(p))
507bba2c361STejun Heo 			numa_cpus = cpus;
508bba2c361STejun Heo 		else if (cpus && cpumask_and(local_cpus, allowed, cpus))
509bba2c361STejun Heo 			numa_cpus = local_cpus;
510bba2c361STejun Heo 	}
511bba2c361STejun Heo 
512bba2c361STejun Heo 	if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) {
513bba2c361STejun Heo 		struct cpumask *local_cpus = this_cpu_cpumask_var_ptr(local_llc_idle_cpumask);
514bba2c361STejun Heo 		const struct cpumask *cpus = llc_span(prev_cpu);
515bba2c361STejun Heo 
516bba2c361STejun Heo 		if (allowed == p->cpus_ptr && task_affinity_all(p))
517bba2c361STejun Heo 			llc_cpus = cpus;
518bba2c361STejun Heo 		else if (cpus && cpumask_and(local_cpus, allowed, cpus))
519bba2c361STejun Heo 			llc_cpus = local_cpus;
520bba2c361STejun Heo 	}
521bba2c361STejun Heo 
522bba2c361STejun Heo 	/*
523bba2c361STejun Heo 	 * If WAKE_SYNC, try to migrate the wakee to the waker's CPU.
524bba2c361STejun Heo 	 */
525bba2c361STejun Heo 	if (wake_flags & SCX_WAKE_SYNC) {
526bba2c361STejun Heo 		int waker_node;
527bba2c361STejun Heo 
528bba2c361STejun Heo 		/*
529bba2c361STejun Heo 		 * If the waker's CPU is cache affine and prev_cpu is idle,
530bba2c361STejun Heo 		 * then avoid a migration.
531bba2c361STejun Heo 		 */
532bba2c361STejun Heo 		cpu = smp_processor_id();
533bba2c361STejun Heo 		if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
534bba2c361STejun Heo 		    scx_idle_test_and_clear_cpu(prev_cpu)) {
535bba2c361STejun Heo 			cpu = prev_cpu;
536bba2c361STejun Heo 			goto out_unlock;
537bba2c361STejun Heo 		}
538bba2c361STejun Heo 
539bba2c361STejun Heo 		/*
540bba2c361STejun Heo 		 * If the waker's local DSQ is empty, and the system is under
541bba2c361STejun Heo 		 * utilized, try to wake up @p to the local DSQ of the waker.
542bba2c361STejun Heo 		 *
543bba2c361STejun Heo 		 * Checking only for an empty local DSQ is insufficient as it
544bba2c361STejun Heo 		 * could give the wakee an unfair advantage when the system is
545bba2c361STejun Heo 		 * oversaturated.
546bba2c361STejun Heo 		 *
547bba2c361STejun Heo 		 * Checking only for the presence of idle CPUs is also
548bba2c361STejun Heo 		 * insufficient as the local DSQ of the waker could have tasks
549bba2c361STejun Heo 		 * piled up on it even if there is an idle core elsewhere on
550bba2c361STejun Heo 		 * the system.
551bba2c361STejun Heo 		 */
552bba2c361STejun Heo 		waker_node = scx_cpu_node_if_enabled(cpu);
553bba2c361STejun Heo 		if (!(current->flags & PF_EXITING) &&
554bba2c361STejun Heo 		    cpu_rq(cpu)->scx.local_dsq.nr == 0 &&
555bba2c361STejun Heo 		    (!(flags & SCX_PICK_IDLE_IN_NODE) || (waker_node == node)) &&
556bba2c361STejun Heo 		    !cpumask_empty(idle_cpumask(waker_node)->cpu)) {
557bba2c361STejun Heo 			if (cpumask_test_cpu(cpu, allowed))
558bba2c361STejun Heo 				goto out_unlock;
559bba2c361STejun Heo 		}
560bba2c361STejun Heo 	}
561bba2c361STejun Heo 
562bba2c361STejun Heo 	/*
563bba2c361STejun Heo 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
564bba2c361STejun Heo 	 * partially idle @prev_cpu.
565bba2c361STejun Heo 	 */
566bba2c361STejun Heo 	if (sched_smt_active()) {
567bba2c361STejun Heo 		/*
568bba2c361STejun Heo 		 * Keep using @prev_cpu if it's part of a fully idle core.
569bba2c361STejun Heo 		 */
570bba2c361STejun Heo 		if (is_prev_allowed &&
571bba2c361STejun Heo 		    cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
572bba2c361STejun Heo 		    scx_idle_test_and_clear_cpu(prev_cpu)) {
573bba2c361STejun Heo 			cpu = prev_cpu;
574bba2c361STejun Heo 			goto out_unlock;
575bba2c361STejun Heo 		}
576bba2c361STejun Heo 
577bba2c361STejun Heo 		/*
578bba2c361STejun Heo 		 * Search for any fully idle core in the same LLC domain.
579bba2c361STejun Heo 		 */
580bba2c361STejun Heo 		if (llc_cpus) {
581bba2c361STejun Heo 			cpu = pick_idle_cpu_in_node(llc_cpus, node, SCX_PICK_IDLE_CORE);
582bba2c361STejun Heo 			if (cpu >= 0)
583bba2c361STejun Heo 				goto out_unlock;
584bba2c361STejun Heo 		}
585bba2c361STejun Heo 
586bba2c361STejun Heo 		/*
587bba2c361STejun Heo 		 * Search for any fully idle core in the same NUMA node.
588bba2c361STejun Heo 		 */
589bba2c361STejun Heo 		if (numa_cpus) {
590bba2c361STejun Heo 			cpu = pick_idle_cpu_in_node(numa_cpus, node, SCX_PICK_IDLE_CORE);
591bba2c361STejun Heo 			if (cpu >= 0)
592bba2c361STejun Heo 				goto out_unlock;
593bba2c361STejun Heo 		}
594bba2c361STejun Heo 
595bba2c361STejun Heo 		/*
596bba2c361STejun Heo 		 * Search for any full-idle core usable by the task.
597bba2c361STejun Heo 		 *
598bba2c361STejun Heo 		 * If the node-aware idle CPU selection policy is enabled
599bba2c361STejun Heo 		 * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always
600bba2c361STejun Heo 		 * begin in prev_cpu's node and proceed to other nodes in
601bba2c361STejun Heo 		 * order of increasing distance.
602bba2c361STejun Heo 		 */
603bba2c361STejun Heo 		cpu = scx_pick_idle_cpu(allowed, node, flags | SCX_PICK_IDLE_CORE);
604bba2c361STejun Heo 		if (cpu >= 0)
605bba2c361STejun Heo 			goto out_unlock;
606bba2c361STejun Heo 
607bba2c361STejun Heo 		/*
608bba2c361STejun Heo 		 * Give up if we're strictly looking for a full-idle SMT
609bba2c361STejun Heo 		 * core.
610bba2c361STejun Heo 		 */
611bba2c361STejun Heo 		if (flags & SCX_PICK_IDLE_CORE) {
612bba2c361STejun Heo 			cpu = -EBUSY;
613bba2c361STejun Heo 			goto out_unlock;
614bba2c361STejun Heo 		}
615bba2c361STejun Heo 	}
616bba2c361STejun Heo 
617bba2c361STejun Heo 	/*
618bba2c361STejun Heo 	 * Use @prev_cpu if it's idle.
619bba2c361STejun Heo 	 */
620bba2c361STejun Heo 	if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
621bba2c361STejun Heo 		cpu = prev_cpu;
622bba2c361STejun Heo 		goto out_unlock;
623bba2c361STejun Heo 	}
624bba2c361STejun Heo 
625bba2c361STejun Heo 	/*
626bba2c361STejun Heo 	 * Use @prev_cpu's sibling if it's idle.
627bba2c361STejun Heo 	 */
628bba2c361STejun Heo 	if (sched_smt_active()) {
629bba2c361STejun Heo 		for_each_cpu_and(cpu, cpu_smt_mask(prev_cpu), allowed) {
630bba2c361STejun Heo 			if (cpu == prev_cpu)
631bba2c361STejun Heo 				continue;
632bba2c361STejun Heo 			if (scx_idle_test_and_clear_cpu(cpu))
633bba2c361STejun Heo 				goto out_unlock;
634bba2c361STejun Heo 		}
635bba2c361STejun Heo 	}
636bba2c361STejun Heo 
637bba2c361STejun Heo 	/*
638bba2c361STejun Heo 	 * Search for any idle CPU in the same LLC domain.
639bba2c361STejun Heo 	 */
640bba2c361STejun Heo 	if (llc_cpus) {
641bba2c361STejun Heo 		cpu = pick_idle_cpu_in_node(llc_cpus, node, 0);
642bba2c361STejun Heo 		if (cpu >= 0)
643bba2c361STejun Heo 			goto out_unlock;
644bba2c361STejun Heo 	}
645bba2c361STejun Heo 
646bba2c361STejun Heo 	/*
647bba2c361STejun Heo 	 * Search for any idle CPU in the same NUMA node.
648bba2c361STejun Heo 	 */
649bba2c361STejun Heo 	if (numa_cpus) {
650bba2c361STejun Heo 		cpu = pick_idle_cpu_in_node(numa_cpus, node, 0);
651bba2c361STejun Heo 		if (cpu >= 0)
652bba2c361STejun Heo 			goto out_unlock;
653bba2c361STejun Heo 	}
654bba2c361STejun Heo 
655bba2c361STejun Heo 	/*
656bba2c361STejun Heo 	 * Search for any idle CPU usable by the task.
657bba2c361STejun Heo 	 *
658bba2c361STejun Heo 	 * If the node-aware idle CPU selection policy is enabled
659bba2c361STejun Heo 	 * (%SCX_OPS_BUILTIN_IDLE_PER_NODE), the search will always begin
660bba2c361STejun Heo 	 * in prev_cpu's node and proceed to other nodes in order of
661bba2c361STejun Heo 	 * increasing distance.
662bba2c361STejun Heo 	 */
663bba2c361STejun Heo 	cpu = scx_pick_idle_cpu(allowed, node, flags);
664bba2c361STejun Heo 
665bba2c361STejun Heo out_unlock:
666bba2c361STejun Heo 	rcu_read_unlock();
667bba2c361STejun Heo out_enable:
668bba2c361STejun Heo 	preempt_enable();
669bba2c361STejun Heo 
670bba2c361STejun Heo 	return cpu;
671bba2c361STejun Heo }
672bba2c361STejun Heo 
673bba2c361STejun Heo /*
674bba2c361STejun Heo  * Initialize global and per-node idle cpumasks.
675bba2c361STejun Heo  */
676bba2c361STejun Heo void scx_idle_init_masks(void)
677bba2c361STejun Heo {
678bba2c361STejun Heo 	int i;
679bba2c361STejun Heo 
680bba2c361STejun Heo 	/* Allocate global idle cpumasks */
681bba2c361STejun Heo 	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.cpu, GFP_KERNEL));
682bba2c361STejun Heo 	BUG_ON(!alloc_cpumask_var(&scx_idle_global_masks.smt, GFP_KERNEL));
683bba2c361STejun Heo 
684bba2c361STejun Heo 	/* Allocate per-node idle cpumasks (use nr_node_ids for non-contiguous NUMA nodes) */
685bba2c361STejun Heo 	scx_idle_node_masks = kzalloc_objs(*scx_idle_node_masks, nr_node_ids);
686bba2c361STejun Heo 	BUG_ON(!scx_idle_node_masks);
687bba2c361STejun Heo 
688bba2c361STejun Heo 	for_each_node(i) {
689bba2c361STejun Heo 		scx_idle_node_masks[i] = kzalloc_node(sizeof(**scx_idle_node_masks),
690bba2c361STejun Heo 							 GFP_KERNEL, i);
691bba2c361STejun Heo 		BUG_ON(!scx_idle_node_masks[i]);
692bba2c361STejun Heo 
693bba2c361STejun Heo 		BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->cpu, GFP_KERNEL, i));
694bba2c361STejun Heo 		BUG_ON(!alloc_cpumask_var_node(&scx_idle_node_masks[i]->smt, GFP_KERNEL, i));
695bba2c361STejun Heo 	}
696bba2c361STejun Heo 
697bba2c361STejun Heo 	/* Allocate local per-cpu idle cpumasks */
698bba2c361STejun Heo 	for_each_possible_cpu(i) {
699bba2c361STejun Heo 		BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_idle_cpumask, i),
700bba2c361STejun Heo 					       GFP_KERNEL, cpu_to_node(i)));
701bba2c361STejun Heo 		BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_llc_idle_cpumask, i),
702bba2c361STejun Heo 					       GFP_KERNEL, cpu_to_node(i)));
703bba2c361STejun Heo 		BUG_ON(!alloc_cpumask_var_node(&per_cpu(local_numa_idle_cpumask, i),
704bba2c361STejun Heo 					       GFP_KERNEL, cpu_to_node(i)));
705bba2c361STejun Heo 	}
706bba2c361STejun Heo }
707bba2c361STejun Heo 
708bba2c361STejun Heo static void update_builtin_idle(int cpu, bool idle)
709bba2c361STejun Heo {
710bba2c361STejun Heo 	int node = scx_cpu_node_if_enabled(cpu);
711bba2c361STejun Heo 	struct cpumask *idle_cpus = idle_cpumask(node)->cpu;
712bba2c361STejun Heo 
713bba2c361STejun Heo 	assign_cpu(cpu, idle_cpus, idle);
714bba2c361STejun Heo 
715bba2c361STejun Heo 	if (sched_smt_active()) {
716bba2c361STejun Heo 		const struct cpumask *smt = cpu_smt_mask(cpu);
717bba2c361STejun Heo 		struct cpumask *idle_smts = idle_cpumask(node)->smt;
718bba2c361STejun Heo 
719bba2c361STejun Heo 		if (idle) {
720bba2c361STejun Heo 			/*
721bba2c361STejun Heo 			 * idle_smt handling is racy but that's fine as it's
722bba2c361STejun Heo 			 * only for optimization and self-correcting.
723bba2c361STejun Heo 			 */
724bba2c361STejun Heo 			if (!cpumask_subset(smt, idle_cpus))
725bba2c361STejun Heo 				return;
726bba2c361STejun Heo 			cpumask_or(idle_smts, idle_smts, smt);
727bba2c361STejun Heo 		} else {
728bba2c361STejun Heo 			cpumask_andnot(idle_smts, idle_smts, smt);
729bba2c361STejun Heo 		}
730bba2c361STejun Heo 	}
731bba2c361STejun Heo }
732bba2c361STejun Heo 
733bba2c361STejun Heo /*
734bba2c361STejun Heo  * Update the idle state of a CPU to @idle.
735bba2c361STejun Heo  *
736bba2c361STejun Heo  * If @do_notify is true, ops.update_idle() is invoked to notify the scx
737bba2c361STejun Heo  * scheduler of an actual idle state transition (idle to busy or vice
738bba2c361STejun Heo  * versa). If @do_notify is false, only the idle state in the idle masks is
739bba2c361STejun Heo  * refreshed without invoking ops.update_idle().
740bba2c361STejun Heo  *
741bba2c361STejun Heo  * This distinction is necessary, because an idle CPU can be "reserved" and
742bba2c361STejun Heo  * awakened via scx_bpf_pick_idle_cpu() + scx_bpf_kick_cpu(), marking it as
743bba2c361STejun Heo  * busy even if no tasks are dispatched. In this case, the CPU may return
744bba2c361STejun Heo  * to idle without a true state transition. Refreshing the idle masks
745bba2c361STejun Heo  * without invoking ops.update_idle() ensures accurate idle state tracking
746bba2c361STejun Heo  * while avoiding unnecessary updates and maintaining balanced state
747bba2c361STejun Heo  * transitions.
748bba2c361STejun Heo  */
749bba2c361STejun Heo void __scx_update_idle(struct rq *rq, bool idle, bool do_notify)
750bba2c361STejun Heo {
751bba2c361STejun Heo 	struct scx_sched *sch = scx_root;
752bba2c361STejun Heo 	int cpu = cpu_of(rq);
753bba2c361STejun Heo 
754bba2c361STejun Heo 	lockdep_assert_rq_held(rq);
755bba2c361STejun Heo 
756bba2c361STejun Heo 	/*
757bba2c361STejun Heo 	 * Update the idle masks:
758bba2c361STejun Heo 	 * - for real idle transitions (do_notify == true)
759bba2c361STejun Heo 	 * - for idle-to-idle transitions (indicated by the previous task
760bba2c361STejun Heo 	 *   being the idle thread, managed by pick_task_idle())
761bba2c361STejun Heo 	 *
762bba2c361STejun Heo 	 * Skip updating idle masks if the previous task is not the idle
763bba2c361STejun Heo 	 * thread, since set_next_task_idle() has already handled it when
764bba2c361STejun Heo 	 * transitioning from a task to the idle thread (calling this
765bba2c361STejun Heo 	 * function with do_notify == true).
766bba2c361STejun Heo 	 *
767bba2c361STejun Heo 	 * In this way we can avoid updating the idle masks twice,
768bba2c361STejun Heo 	 * unnecessarily.
769bba2c361STejun Heo 	 */
770bba2c361STejun Heo 	if (static_branch_likely(&scx_builtin_idle_enabled))
771bba2c361STejun Heo 		if (do_notify || is_idle_task(rq->curr))
772bba2c361STejun Heo 			update_builtin_idle(cpu, idle);
773bba2c361STejun Heo 
774bba2c361STejun Heo 	/*
775bba2c361STejun Heo 	 * Trigger ops.update_idle() only when transitioning from a task to
776bba2c361STejun Heo 	 * the idle thread and vice versa.
777bba2c361STejun Heo 	 *
778bba2c361STejun Heo 	 * Idle transitions are indicated by do_notify being set to true,
779bba2c361STejun Heo 	 * managed by put_prev_task_idle()/set_next_task_idle().
780bba2c361STejun Heo 	 *
781bba2c361STejun Heo 	 * This must come after builtin idle update so that BPF schedulers can
782bba2c361STejun Heo 	 * create interlocking between ops.update_idle() and ops.enqueue() -
783bba2c361STejun Heo 	 * either enqueue() sees the idle bit or update_idle() sees the task
784bba2c361STejun Heo 	 * that enqueue() queued.
785bba2c361STejun Heo 	 */
786bba2c361STejun Heo 	if (SCX_HAS_OP(sch, update_idle) && do_notify &&
787bba2c361STejun Heo 	    !scx_bypassing(sch, cpu_of(rq)))
788bba2c361STejun Heo 		SCX_CALL_OP(sch, update_idle, rq, scx_cpu_arg(cpu_of(rq)), idle);
789bba2c361STejun Heo }
790bba2c361STejun Heo 
791bba2c361STejun Heo static void reset_idle_masks(struct sched_ext_ops *ops)
792bba2c361STejun Heo {
793bba2c361STejun Heo 	int node;
794bba2c361STejun Heo 
795bba2c361STejun Heo 	/*
796bba2c361STejun Heo 	 * Consider all online cpus idle. Should converge to the actual state
797bba2c361STejun Heo 	 * quickly.
798bba2c361STejun Heo 	 */
799bba2c361STejun Heo 	if (!(ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)) {
800bba2c361STejun Heo 		cpumask_copy(idle_cpumask(NUMA_NO_NODE)->cpu, cpu_online_mask);
801bba2c361STejun Heo 		cpumask_copy(idle_cpumask(NUMA_NO_NODE)->smt, cpu_online_mask);
802bba2c361STejun Heo 		return;
803bba2c361STejun Heo 	}
804bba2c361STejun Heo 
805bba2c361STejun Heo 	for_each_node(node) {
806bba2c361STejun Heo 		const struct cpumask *node_mask = cpumask_of_node(node);
807bba2c361STejun Heo 
808bba2c361STejun Heo 		cpumask_and(idle_cpumask(node)->cpu, cpu_online_mask, node_mask);
809bba2c361STejun Heo 		cpumask_and(idle_cpumask(node)->smt, cpu_online_mask, node_mask);
810bba2c361STejun Heo 	}
811bba2c361STejun Heo }
812bba2c361STejun Heo 
813bba2c361STejun Heo void scx_idle_enable(struct sched_ext_ops *ops)
814bba2c361STejun Heo {
815bba2c361STejun Heo 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE))
816bba2c361STejun Heo 		static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
817bba2c361STejun Heo 	else
818bba2c361STejun Heo 		static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
819bba2c361STejun Heo 
820bba2c361STejun Heo 	if (ops->flags & SCX_OPS_BUILTIN_IDLE_PER_NODE)
821bba2c361STejun Heo 		static_branch_enable_cpuslocked(&scx_builtin_idle_per_node);
822bba2c361STejun Heo 	else
823bba2c361STejun Heo 		static_branch_disable_cpuslocked(&scx_builtin_idle_per_node);
824bba2c361STejun Heo 
825bba2c361STejun Heo 	reset_idle_masks(ops);
826bba2c361STejun Heo }
827bba2c361STejun Heo 
828bba2c361STejun Heo void scx_idle_disable(void)
829bba2c361STejun Heo {
830bba2c361STejun Heo 	static_branch_disable(&scx_builtin_idle_enabled);
831bba2c361STejun Heo 	static_branch_disable(&scx_builtin_idle_per_node);
832bba2c361STejun Heo }
833bba2c361STejun Heo 
834bba2c361STejun Heo /********************************************************************************
835bba2c361STejun Heo  * Helpers that can be called from the BPF scheduler.
836bba2c361STejun Heo  */
837bba2c361STejun Heo 
838bba2c361STejun Heo static int validate_node(struct scx_sched *sch, int node)
839bba2c361STejun Heo {
840bba2c361STejun Heo 	if (!static_branch_likely(&scx_builtin_idle_per_node)) {
841bba2c361STejun Heo 		scx_error(sch, "per-node idle tracking is disabled");
842bba2c361STejun Heo 		return -EOPNOTSUPP;
843bba2c361STejun Heo 	}
844bba2c361STejun Heo 
845bba2c361STejun Heo 	/* Return no entry for NUMA_NO_NODE (not a critical scx error) */
846bba2c361STejun Heo 	if (node == NUMA_NO_NODE)
847bba2c361STejun Heo 		return -ENOENT;
848bba2c361STejun Heo 
849bba2c361STejun Heo 	/* Make sure node is in a valid range */
850bba2c361STejun Heo 	if (node < 0 || node >= nr_node_ids) {
851bba2c361STejun Heo 		scx_error(sch, "invalid node %d", node);
852bba2c361STejun Heo 		return -EINVAL;
853bba2c361STejun Heo 	}
854bba2c361STejun Heo 
855bba2c361STejun Heo 	/* Make sure the node is part of the set of possible nodes */
856bba2c361STejun Heo 	if (!node_possible(node)) {
857bba2c361STejun Heo 		scx_error(sch, "unavailable node %d", node);
858bba2c361STejun Heo 		return -EINVAL;
859bba2c361STejun Heo 	}
860bba2c361STejun Heo 
861bba2c361STejun Heo 	return node;
862bba2c361STejun Heo }
863bba2c361STejun Heo 
864bba2c361STejun Heo __bpf_kfunc_start_defs();
865bba2c361STejun Heo 
866bba2c361STejun Heo static bool check_builtin_idle_enabled(struct scx_sched *sch)
867bba2c361STejun Heo {
868bba2c361STejun Heo 	if (static_branch_likely(&scx_builtin_idle_enabled))
869bba2c361STejun Heo 		return true;
870bba2c361STejun Heo 
871bba2c361STejun Heo 	scx_error(sch, "built-in idle tracking is disabled");
872bba2c361STejun Heo 	return false;
873bba2c361STejun Heo }
874bba2c361STejun Heo 
875bba2c361STejun Heo /*
876bba2c361STejun Heo  * Determine whether @p is a migration-disabled task in the context of BPF
877bba2c361STejun Heo  * code.
878bba2c361STejun Heo  *
879bba2c361STejun Heo  * We can't simply check whether @p->migration_disabled is set in a
880bba2c361STejun Heo  * sched_ext callback, because the BPF prolog (__bpf_prog_enter) may disable
881bba2c361STejun Heo  * migration for the current task while running BPF code.
882bba2c361STejun Heo  *
883bba2c361STejun Heo  * Since the BPF prolog calls migrate_disable() only when CONFIG_PREEMPT_RCU
884bba2c361STejun Heo  * is enabled (via rcu_read_lock_dont_migrate()), migration_disabled == 1 for
885bba2c361STejun Heo  * the current task is ambiguous only in that case: it could be from the BPF
886bba2c361STejun Heo  * prolog rather than a real migrate_disable() call.
887bba2c361STejun Heo  *
888bba2c361STejun Heo  * Without CONFIG_PREEMPT_RCU, the BPF prolog never calls migrate_disable(),
889bba2c361STejun Heo  * so migration_disabled == 1 always means the task is truly
890bba2c361STejun Heo  * migration-disabled.
891bba2c361STejun Heo  *
892bba2c361STejun Heo  * Therefore, when migration_disabled == 1 and CONFIG_PREEMPT_RCU is enabled,
893bba2c361STejun Heo  * check whether @p is the current task or not: if it is, then migration was
894bba2c361STejun Heo  * not disabled before entering the callback, otherwise migration was disabled.
895bba2c361STejun Heo  *
896bba2c361STejun Heo  * Returns true if @p is migration-disabled, false otherwise.
897bba2c361STejun Heo  */
898bba2c361STejun Heo static bool is_bpf_migration_disabled(const struct task_struct *p)
899bba2c361STejun Heo {
900bba2c361STejun Heo 	if (p->migration_disabled == 1) {
901bba2c361STejun Heo 		if (IS_ENABLED(CONFIG_PREEMPT_RCU))
902bba2c361STejun Heo 			return p != current;
903bba2c361STejun Heo 		return true;
904bba2c361STejun Heo 	}
905bba2c361STejun Heo 	return p->migration_disabled;
906bba2c361STejun Heo }
907bba2c361STejun Heo 
908bba2c361STejun Heo static s32 select_cpu_from_kfunc(struct scx_sched *sch, struct task_struct *p,
909bba2c361STejun Heo 				 s32 prev_cpu, u64 wake_flags,
910bba2c361STejun Heo 				 const struct cpumask *allowed, u64 flags)
911bba2c361STejun Heo {
912bba2c361STejun Heo 	unsigned long irq_flags;
913bba2c361STejun Heo 	bool we_locked = false;
914bba2c361STejun Heo 	s32 cpu;
915bba2c361STejun Heo 
916bba2c361STejun Heo 	if (!scx_cpu_valid(sch, prev_cpu, NULL))
917bba2c361STejun Heo 		return -EINVAL;
918bba2c361STejun Heo 
919bba2c361STejun Heo 	if (!check_builtin_idle_enabled(sch))
920bba2c361STejun Heo 		return -EBUSY;
921bba2c361STejun Heo 
922bba2c361STejun Heo 	/*
923bba2c361STejun Heo 	 * Accessing p->cpus_ptr / p->nr_cpus_allowed needs either @p's rq
924bba2c361STejun Heo 	 * lock or @p's pi_lock. Three cases:
925bba2c361STejun Heo 	 *
926bba2c361STejun Heo 	 *  - inside ops.select_cpu(): try_to_wake_up() holds the wake-up
927bba2c361STejun Heo 	 *    task's pi_lock; the wake-up task is recorded in kf_tasks[0]
928bba2c361STejun Heo 	 *    by SCX_CALL_OP_TASK_RET().
929bba2c361STejun Heo 	 *  - other rq-locked SCX op: scx_locked_rq() points at the held rq.
930bba2c361STejun Heo 	 *  - truly unlocked (UNLOCKED ops, SYSCALL, non-SCX struct_ops):
931bba2c361STejun Heo 	 *    nothing held, take pi_lock ourselves.
932bba2c361STejun Heo 	 *
933bba2c361STejun Heo 	 * In the first two cases, BPF schedulers may pass an arbitrary task
934bba2c361STejun Heo 	 * that the held lock doesn't cover. Refuse those.
935bba2c361STejun Heo 	 */
936bba2c361STejun Heo 	if (this_rq()->scx.in_select_cpu) {
937bba2c361STejun Heo 		if (!scx_kf_arg_task_ok(sch, p))
938bba2c361STejun Heo 			return -EINVAL;
939bba2c361STejun Heo 		lockdep_assert_held(&p->pi_lock);
940bba2c361STejun Heo 	} else if (scx_locked_rq()) {
941bba2c361STejun Heo 		if (task_rq(p) != scx_locked_rq())
942bba2c361STejun Heo 			goto cross_task;
943bba2c361STejun Heo 	} else {
944bba2c361STejun Heo 		raw_spin_lock_irqsave(&p->pi_lock, irq_flags);
945bba2c361STejun Heo 		we_locked = true;
946bba2c361STejun Heo 	}
947bba2c361STejun Heo 
948bba2c361STejun Heo 	/*
949bba2c361STejun Heo 	 * This may also be called from ops.enqueue(), so we need to handle
950bba2c361STejun Heo 	 * per-CPU tasks as well. For these tasks, we can skip all idle CPU
951bba2c361STejun Heo 	 * selection optimizations and simply check whether the previously
952bba2c361STejun Heo 	 * used CPU is idle and within the allowed cpumask.
953bba2c361STejun Heo 	 */
954bba2c361STejun Heo 	if (p->nr_cpus_allowed == 1 || is_bpf_migration_disabled(p)) {
955bba2c361STejun Heo 		if (cpumask_test_cpu(prev_cpu, allowed ?: p->cpus_ptr) &&
956bba2c361STejun Heo 		    scx_idle_test_and_clear_cpu(prev_cpu))
957bba2c361STejun Heo 			cpu = prev_cpu;
958bba2c361STejun Heo 		else
959bba2c361STejun Heo 			cpu = -EBUSY;
960bba2c361STejun Heo 	} else {
961bba2c361STejun Heo 		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags,
962bba2c361STejun Heo 					 allowed ?: p->cpus_ptr, flags);
963bba2c361STejun Heo 	}
964bba2c361STejun Heo 
965bba2c361STejun Heo 	if (we_locked)
966bba2c361STejun Heo 		raw_spin_unlock_irqrestore(&p->pi_lock, irq_flags);
967bba2c361STejun Heo 
968bba2c361STejun Heo 	return cpu;
969bba2c361STejun Heo 
970bba2c361STejun Heo cross_task:
971bba2c361STejun Heo 	scx_error(sch, "select_cpu kfunc called cross-task on %s[%d]",
972bba2c361STejun Heo 		  p->comm, p->pid);
973bba2c361STejun Heo 	return -EINVAL;
974bba2c361STejun Heo }
975bba2c361STejun Heo 
976bba2c361STejun Heo /**
977bba2c361STejun Heo  * scx_bpf_cpu_node - Return the NUMA node the given @cpu belongs to, or
978bba2c361STejun Heo  *		      trigger an error if @cpu is invalid
979bba2c361STejun Heo  * @cpu: target CPU
980bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
981bba2c361STejun Heo  */
982bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_cpu_node(s32 cpu, const struct bpf_prog_aux *aux)
983bba2c361STejun Heo {
984bba2c361STejun Heo 	struct scx_sched *sch;
985bba2c361STejun Heo 
986bba2c361STejun Heo 	guard(rcu)();
987bba2c361STejun Heo 
988bba2c361STejun Heo 	sch = scx_prog_sched(aux);
989bba2c361STejun Heo 	if (unlikely(!sch) || !scx_cpu_valid(sch, cpu, NULL))
990bba2c361STejun Heo 		return NUMA_NO_NODE;
991bba2c361STejun Heo 	return cpu_to_node(cpu);
992bba2c361STejun Heo }
993bba2c361STejun Heo 
994bba2c361STejun Heo /**
995bba2c361STejun Heo  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
996bba2c361STejun Heo  * @p: task_struct to select a CPU for
997bba2c361STejun Heo  * @prev_cpu: CPU @p was on previously
998bba2c361STejun Heo  * @wake_flags: %SCX_WAKE_* flags
999bba2c361STejun Heo  * @is_idle: out parameter indicating whether the returned CPU is idle
1000bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1001bba2c361STejun Heo  *
1002bba2c361STejun Heo  * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
1003bba2c361STejun Heo  * context such as a BPF test_run() call, as long as built-in CPU selection
1004bba2c361STejun Heo  * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
1005bba2c361STejun Heo  * is set.
1006bba2c361STejun Heo  *
1007bba2c361STejun Heo  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
1008bba2c361STejun Heo  * currently idle and thus a good candidate for direct dispatching.
1009bba2c361STejun Heo  */
1010bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
1011bba2c361STejun Heo 				       u64 wake_flags, bool *is_idle,
1012bba2c361STejun Heo 				       const struct bpf_prog_aux *aux)
1013bba2c361STejun Heo {
1014bba2c361STejun Heo 	struct scx_sched *sch;
1015bba2c361STejun Heo 	s32 cpu;
1016bba2c361STejun Heo 
1017bba2c361STejun Heo 	guard(rcu)();
1018bba2c361STejun Heo 
1019bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1020bba2c361STejun Heo 	if (unlikely(!sch))
1021bba2c361STejun Heo 		return -ENODEV;
1022bba2c361STejun Heo 
1023bba2c361STejun Heo 	cpu = select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags, NULL, 0);
1024bba2c361STejun Heo 	if (cpu >= 0) {
1025bba2c361STejun Heo 		*is_idle = true;
1026bba2c361STejun Heo 		return cpu;
1027bba2c361STejun Heo 	}
1028bba2c361STejun Heo 	*is_idle = false;
1029bba2c361STejun Heo 	return prev_cpu;
1030bba2c361STejun Heo }
1031bba2c361STejun Heo 
1032bba2c361STejun Heo struct scx_bpf_select_cpu_and_args {
1033bba2c361STejun Heo 	/* @p and @cpus_allowed can't be packed together as KF_RCU is not transitive */
1034bba2c361STejun Heo 	s32			prev_cpu;
1035bba2c361STejun Heo 	u64			wake_flags;
1036bba2c361STejun Heo 	u64			flags;
1037bba2c361STejun Heo };
1038bba2c361STejun Heo 
1039bba2c361STejun Heo /**
1040bba2c361STejun Heo  * __scx_bpf_select_cpu_and - Arg-wrapped CPU selection with cpumask
1041bba2c361STejun Heo  * @p: task_struct to select a CPU for
1042bba2c361STejun Heo  * @cpus_allowed: cpumask of allowed CPUs
1043bba2c361STejun Heo  * @args: struct containing the rest of the arguments
1044bba2c361STejun Heo  *       @args->prev_cpu: CPU @p was on previously
1045bba2c361STejun Heo  *       @args->wake_flags: %SCX_WAKE_* flags
1046bba2c361STejun Heo  *       @args->flags: %SCX_PICK_IDLE* flags
1047bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1048bba2c361STejun Heo  *
1049bba2c361STejun Heo  * Wrapper kfunc that takes arguments via struct to work around BPF's 5 argument
1050bba2c361STejun Heo  * limit. BPF programs should use scx_bpf_select_cpu_and() which is provided
1051bba2c361STejun Heo  * as an inline wrapper in common.bpf.h.
1052bba2c361STejun Heo  *
1053bba2c361STejun Heo  * Can be called from ops.select_cpu(), ops.enqueue(), or from an unlocked
1054bba2c361STejun Heo  * context such as a BPF test_run() call, as long as built-in CPU selection
1055bba2c361STejun Heo  * is enabled: ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE
1056bba2c361STejun Heo  * is set.
1057bba2c361STejun Heo  *
1058bba2c361STejun Heo  * @p, @args->prev_cpu and @args->wake_flags match ops.select_cpu().
1059bba2c361STejun Heo  *
1060bba2c361STejun Heo  * Returns the selected idle CPU, which will be automatically awakened upon
1061bba2c361STejun Heo  * returning from ops.select_cpu() and can be used for direct dispatch, or
1062bba2c361STejun Heo  * a negative value if no idle CPU is available.
1063bba2c361STejun Heo  */
1064bba2c361STejun Heo __bpf_kfunc s32
1065bba2c361STejun Heo __scx_bpf_select_cpu_and(struct task_struct *p, const struct cpumask *cpus_allowed,
1066bba2c361STejun Heo 			 struct scx_bpf_select_cpu_and_args *args,
1067bba2c361STejun Heo 			 const struct bpf_prog_aux *aux)
1068bba2c361STejun Heo {
1069bba2c361STejun Heo 	struct scx_sched *sch;
1070bba2c361STejun Heo 
1071bba2c361STejun Heo 	guard(rcu)();
1072bba2c361STejun Heo 
1073bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1074bba2c361STejun Heo 	if (unlikely(!sch))
1075bba2c361STejun Heo 		return -ENODEV;
1076bba2c361STejun Heo 
1077bba2c361STejun Heo 	return select_cpu_from_kfunc(sch, p, args->prev_cpu, args->wake_flags,
1078bba2c361STejun Heo 				     cpus_allowed, args->flags);
1079bba2c361STejun Heo }
1080bba2c361STejun Heo 
1081bba2c361STejun Heo /*
1082bba2c361STejun Heo  * COMPAT: Will be removed in v6.22.
1083bba2c361STejun Heo  */
1084bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_select_cpu_and(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
1085bba2c361STejun Heo 				       const struct cpumask *cpus_allowed, u64 flags)
1086bba2c361STejun Heo {
1087bba2c361STejun Heo 	struct scx_sched *sch;
1088bba2c361STejun Heo 
1089bba2c361STejun Heo 	guard(rcu)();
1090bba2c361STejun Heo 
1091bba2c361STejun Heo 	sch = rcu_dereference(scx_root);
1092bba2c361STejun Heo 	if (unlikely(!sch))
1093bba2c361STejun Heo 		return -ENODEV;
1094bba2c361STejun Heo 
1095bba2c361STejun Heo #ifdef CONFIG_EXT_SUB_SCHED
1096bba2c361STejun Heo 	/*
1097bba2c361STejun Heo 	 * Disallow if any sub-scheds are attached. There is no way to tell
1098bba2c361STejun Heo 	 * which scheduler called us, just error out @p's scheduler.
1099bba2c361STejun Heo 	 */
1100bba2c361STejun Heo 	if (unlikely(!list_empty(&sch->children))) {
1101bba2c361STejun Heo 		scx_error(scx_task_sched(p), "__scx_bpf_select_cpu_and() must be used");
1102bba2c361STejun Heo 		return -EINVAL;
1103bba2c361STejun Heo 	}
1104bba2c361STejun Heo #endif
1105bba2c361STejun Heo 
1106bba2c361STejun Heo 	return select_cpu_from_kfunc(sch, p, prev_cpu, wake_flags,
1107bba2c361STejun Heo 				     cpus_allowed, flags);
1108bba2c361STejun Heo }
1109bba2c361STejun Heo 
1110bba2c361STejun Heo /**
1111bba2c361STejun Heo  * scx_bpf_get_idle_cpumask_node - Get a referenced kptr to the
1112bba2c361STejun Heo  * idle-tracking per-CPU cpumask of a target NUMA node.
1113bba2c361STejun Heo  * @node: target NUMA node
1114bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1115bba2c361STejun Heo  *
1116bba2c361STejun Heo  * Returns an empty cpumask if idle tracking is not enabled, if @node is
1117bba2c361STejun Heo  * not valid, or running on a UP kernel. In this case the actual error will
1118bba2c361STejun Heo  * be reported to the BPF scheduler via scx_error().
1119bba2c361STejun Heo  */
1120bba2c361STejun Heo __bpf_kfunc const struct cpumask *
1121bba2c361STejun Heo scx_bpf_get_idle_cpumask_node(s32 node, const struct bpf_prog_aux *aux)
1122bba2c361STejun Heo {
1123bba2c361STejun Heo 	struct scx_sched *sch;
1124bba2c361STejun Heo 
1125bba2c361STejun Heo 	guard(rcu)();
1126bba2c361STejun Heo 
1127bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1128bba2c361STejun Heo 	if (unlikely(!sch))
1129bba2c361STejun Heo 		return cpu_none_mask;
1130bba2c361STejun Heo 
1131bba2c361STejun Heo 	node = validate_node(sch, node);
1132bba2c361STejun Heo 	if (node < 0)
1133bba2c361STejun Heo 		return cpu_none_mask;
1134bba2c361STejun Heo 
1135bba2c361STejun Heo 	return idle_cpumask(node)->cpu;
1136bba2c361STejun Heo }
1137bba2c361STejun Heo 
1138bba2c361STejun Heo /**
1139bba2c361STejun Heo  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
1140bba2c361STejun Heo  * per-CPU cpumask.
1141bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1142bba2c361STejun Heo  *
1143bba2c361STejun Heo  * Returns an empty mask if idle tracking is not enabled, or running on a
1144bba2c361STejun Heo  * UP kernel.
1145bba2c361STejun Heo  */
1146bba2c361STejun Heo __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(const struct bpf_prog_aux *aux)
1147bba2c361STejun Heo {
1148bba2c361STejun Heo 	struct scx_sched *sch;
1149bba2c361STejun Heo 
1150bba2c361STejun Heo 	guard(rcu)();
1151bba2c361STejun Heo 
1152bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1153bba2c361STejun Heo 	if (unlikely(!sch))
1154bba2c361STejun Heo 		return cpu_none_mask;
1155bba2c361STejun Heo 
1156bba2c361STejun Heo 	if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
1157bba2c361STejun Heo 		scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
1158bba2c361STejun Heo 		return cpu_none_mask;
1159bba2c361STejun Heo 	}
1160bba2c361STejun Heo 
1161bba2c361STejun Heo 	if (!check_builtin_idle_enabled(sch))
1162bba2c361STejun Heo 		return cpu_none_mask;
1163bba2c361STejun Heo 
1164bba2c361STejun Heo 	return idle_cpumask(NUMA_NO_NODE)->cpu;
1165bba2c361STejun Heo }
1166bba2c361STejun Heo 
1167bba2c361STejun Heo /**
1168bba2c361STejun Heo  * scx_bpf_get_idle_smtmask_node - Get a referenced kptr to the
1169bba2c361STejun Heo  * idle-tracking, per-physical-core cpumask of a target NUMA node. Can be
1170bba2c361STejun Heo  * used to determine if an entire physical core is free.
1171bba2c361STejun Heo  * @node: target NUMA node
1172bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1173bba2c361STejun Heo  *
1174bba2c361STejun Heo  * Returns an empty cpumask if idle tracking is not enabled, if @node is
1175bba2c361STejun Heo  * not valid, or running on a UP kernel. In this case the actual error will
1176bba2c361STejun Heo  * be reported to the BPF scheduler via scx_error().
1177bba2c361STejun Heo  */
1178bba2c361STejun Heo __bpf_kfunc const struct cpumask *
1179bba2c361STejun Heo scx_bpf_get_idle_smtmask_node(s32 node, const struct bpf_prog_aux *aux)
1180bba2c361STejun Heo {
1181bba2c361STejun Heo 	struct scx_sched *sch;
1182bba2c361STejun Heo 
1183bba2c361STejun Heo 	guard(rcu)();
1184bba2c361STejun Heo 
1185bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1186bba2c361STejun Heo 	if (unlikely(!sch))
1187bba2c361STejun Heo 		return cpu_none_mask;
1188bba2c361STejun Heo 
1189bba2c361STejun Heo 	node = validate_node(sch, node);
1190bba2c361STejun Heo 	if (node < 0)
1191bba2c361STejun Heo 		return cpu_none_mask;
1192bba2c361STejun Heo 
1193bba2c361STejun Heo 	if (sched_smt_active())
1194bba2c361STejun Heo 		return idle_cpumask(node)->smt;
1195bba2c361STejun Heo 	else
1196bba2c361STejun Heo 		return idle_cpumask(node)->cpu;
1197bba2c361STejun Heo }
1198bba2c361STejun Heo 
1199bba2c361STejun Heo /**
1200bba2c361STejun Heo  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
1201bba2c361STejun Heo  * per-physical-core cpumask. Can be used to determine if an entire physical
1202bba2c361STejun Heo  * core is free.
1203bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1204bba2c361STejun Heo  *
1205bba2c361STejun Heo  * Returns an empty mask if idle tracking is not enabled, or running on a
1206bba2c361STejun Heo  * UP kernel.
1207bba2c361STejun Heo  */
1208bba2c361STejun Heo __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(const struct bpf_prog_aux *aux)
1209bba2c361STejun Heo {
1210bba2c361STejun Heo 	struct scx_sched *sch;
1211bba2c361STejun Heo 
1212bba2c361STejun Heo 	guard(rcu)();
1213bba2c361STejun Heo 
1214bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1215bba2c361STejun Heo 	if (unlikely(!sch))
1216bba2c361STejun Heo 		return cpu_none_mask;
1217bba2c361STejun Heo 
1218bba2c361STejun Heo 	if (static_branch_unlikely(&scx_builtin_idle_per_node)) {
1219bba2c361STejun Heo 		scx_error(sch, "SCX_OPS_BUILTIN_IDLE_PER_NODE enabled");
1220bba2c361STejun Heo 		return cpu_none_mask;
1221bba2c361STejun Heo 	}
1222bba2c361STejun Heo 
1223bba2c361STejun Heo 	if (!check_builtin_idle_enabled(sch))
1224bba2c361STejun Heo 		return cpu_none_mask;
1225bba2c361STejun Heo 
1226bba2c361STejun Heo 	if (sched_smt_active())
1227bba2c361STejun Heo 		return idle_cpumask(NUMA_NO_NODE)->smt;
1228bba2c361STejun Heo 	else
1229bba2c361STejun Heo 		return idle_cpumask(NUMA_NO_NODE)->cpu;
1230bba2c361STejun Heo }
1231bba2c361STejun Heo 
1232bba2c361STejun Heo /**
1233bba2c361STejun Heo  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
1234bba2c361STejun Heo  * either the percpu, or SMT idle-tracking cpumask.
1235bba2c361STejun Heo  * @idle_mask: &cpumask to use
1236bba2c361STejun Heo  */
1237bba2c361STejun Heo __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
1238bba2c361STejun Heo {
1239bba2c361STejun Heo 	/*
1240bba2c361STejun Heo 	 * Empty function body because we aren't actually acquiring or releasing
1241bba2c361STejun Heo 	 * a reference to a global idle cpumask, which is read-only in the
1242bba2c361STejun Heo 	 * caller and is never released. The acquire / release semantics here
1243bba2c361STejun Heo 	 * are just used to make the cpumask a trusted pointer in the caller.
1244bba2c361STejun Heo 	 */
1245bba2c361STejun Heo }
1246bba2c361STejun Heo 
1247bba2c361STejun Heo /**
1248bba2c361STejun Heo  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
1249bba2c361STejun Heo  * @cpu: cpu to test and clear idle for
1250bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1251bba2c361STejun Heo  *
1252bba2c361STejun Heo  * Returns %true if @cpu was idle and its idle state was successfully cleared.
1253bba2c361STejun Heo  * %false otherwise.
1254bba2c361STejun Heo  *
1255bba2c361STejun Heo  * Unavailable if ops.update_idle() is implemented and
1256bba2c361STejun Heo  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
1257bba2c361STejun Heo  */
1258bba2c361STejun Heo __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu, const struct bpf_prog_aux *aux)
1259bba2c361STejun Heo {
1260bba2c361STejun Heo 	struct scx_sched *sch;
1261bba2c361STejun Heo 
1262bba2c361STejun Heo 	guard(rcu)();
1263bba2c361STejun Heo 
1264bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1265bba2c361STejun Heo 	if (unlikely(!sch))
1266bba2c361STejun Heo 		return false;
1267bba2c361STejun Heo 
1268bba2c361STejun Heo 	if (!check_builtin_idle_enabled(sch))
1269bba2c361STejun Heo 		return false;
1270bba2c361STejun Heo 
1271bba2c361STejun Heo 	if (!scx_cpu_valid(sch, cpu, NULL))
1272bba2c361STejun Heo 		return false;
1273bba2c361STejun Heo 
1274bba2c361STejun Heo 	return scx_idle_test_and_clear_cpu(cpu);
1275bba2c361STejun Heo }
1276bba2c361STejun Heo 
1277bba2c361STejun Heo /**
1278bba2c361STejun Heo  * scx_bpf_pick_idle_cpu_node - Pick and claim an idle cpu from @node
1279bba2c361STejun Heo  * @cpus_allowed: Allowed cpumask
1280bba2c361STejun Heo  * @node: target NUMA node
1281bba2c361STejun Heo  * @flags: %SCX_PICK_IDLE_* flags
1282bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1283bba2c361STejun Heo  *
1284bba2c361STejun Heo  * Pick and claim an idle cpu in @cpus_allowed from the NUMA node @node.
1285bba2c361STejun Heo  *
1286bba2c361STejun Heo  * Returns the picked idle cpu number on success, or -%EBUSY if no matching
1287bba2c361STejun Heo  * cpu was found.
1288bba2c361STejun Heo  *
1289bba2c361STejun Heo  * The search starts from @node and proceeds to other online NUMA nodes in
1290bba2c361STejun Heo  * order of increasing distance (unless SCX_PICK_IDLE_IN_NODE is specified,
1291bba2c361STejun Heo  * in which case the search is limited to the target @node).
1292bba2c361STejun Heo  *
1293bba2c361STejun Heo  * Always returns an error if ops.update_idle() is implemented and
1294bba2c361STejun Heo  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set, or if
1295bba2c361STejun Heo  * %SCX_OPS_BUILTIN_IDLE_PER_NODE is not set.
1296bba2c361STejun Heo  */
1297bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_pick_idle_cpu_node(const struct cpumask *cpus_allowed,
1298bba2c361STejun Heo 					   s32 node, u64 flags,
1299bba2c361STejun Heo 					   const struct bpf_prog_aux *aux)
1300bba2c361STejun Heo {
1301bba2c361STejun Heo 	struct scx_sched *sch;
1302bba2c361STejun Heo 
1303bba2c361STejun Heo 	guard(rcu)();
1304bba2c361STejun Heo 
1305bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1306bba2c361STejun Heo 	if (unlikely(!sch))
1307bba2c361STejun Heo 		return -ENODEV;
1308bba2c361STejun Heo 
1309bba2c361STejun Heo 	node = validate_node(sch, node);
1310bba2c361STejun Heo 	if (node < 0)
1311bba2c361STejun Heo 		return node;
1312bba2c361STejun Heo 
1313bba2c361STejun Heo 	return scx_pick_idle_cpu(cpus_allowed, node, flags);
1314bba2c361STejun Heo }
1315bba2c361STejun Heo 
1316bba2c361STejun Heo /**
1317bba2c361STejun Heo  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
1318bba2c361STejun Heo  * @cpus_allowed: Allowed cpumask
1319bba2c361STejun Heo  * @flags: %SCX_PICK_IDLE_CPU_* flags
1320bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1321bba2c361STejun Heo  *
1322bba2c361STejun Heo  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
1323bba2c361STejun Heo  * number on success. -%EBUSY if no matching cpu was found.
1324bba2c361STejun Heo  *
1325bba2c361STejun Heo  * Idle CPU tracking may race against CPU scheduling state transitions. For
1326bba2c361STejun Heo  * example, this function may return -%EBUSY as CPUs are transitioning into the
1327bba2c361STejun Heo  * idle state. If the caller then assumes that there will be dispatch events on
1328bba2c361STejun Heo  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
1329bba2c361STejun Heo  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
1330bba2c361STejun Heo  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
1331bba2c361STejun Heo  * event in the near future.
1332bba2c361STejun Heo  *
1333bba2c361STejun Heo  * Unavailable if ops.update_idle() is implemented and
1334bba2c361STejun Heo  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
1335bba2c361STejun Heo  *
1336bba2c361STejun Heo  * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
1337bba2c361STejun Heo  * scx_bpf_pick_idle_cpu_node() instead.
1338bba2c361STejun Heo  */
1339bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
1340bba2c361STejun Heo 				      u64 flags, const struct bpf_prog_aux *aux)
1341bba2c361STejun Heo {
1342bba2c361STejun Heo 	struct scx_sched *sch;
1343bba2c361STejun Heo 
1344bba2c361STejun Heo 	guard(rcu)();
1345bba2c361STejun Heo 
1346bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1347bba2c361STejun Heo 	if (unlikely(!sch))
1348bba2c361STejun Heo 		return -ENODEV;
1349bba2c361STejun Heo 
1350bba2c361STejun Heo 	if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
1351bba2c361STejun Heo 		scx_error(sch, "per-node idle tracking is enabled");
1352bba2c361STejun Heo 		return -EBUSY;
1353bba2c361STejun Heo 	}
1354bba2c361STejun Heo 
1355bba2c361STejun Heo 	if (!check_builtin_idle_enabled(sch))
1356bba2c361STejun Heo 		return -EBUSY;
1357bba2c361STejun Heo 
1358bba2c361STejun Heo 	return scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
1359bba2c361STejun Heo }
1360bba2c361STejun Heo 
1361bba2c361STejun Heo /**
1362bba2c361STejun Heo  * scx_bpf_pick_any_cpu_node - Pick and claim an idle cpu if available
1363bba2c361STejun Heo  *			       or pick any CPU from @node
1364bba2c361STejun Heo  * @cpus_allowed: Allowed cpumask
1365bba2c361STejun Heo  * @node: target NUMA node
1366bba2c361STejun Heo  * @flags: %SCX_PICK_IDLE_CPU_* flags
1367bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1368bba2c361STejun Heo  *
1369bba2c361STejun Heo  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1370bba2c361STejun Heo  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1371bba2c361STejun Heo  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1372bba2c361STejun Heo  * empty.
1373bba2c361STejun Heo  *
1374bba2c361STejun Heo  * The search starts from @node and proceeds to other online NUMA nodes in
1375bba2c361STejun Heo  * order of increasing distance (unless %SCX_PICK_IDLE_IN_NODE is specified,
1376bba2c361STejun Heo  * in which case the search is limited to the target @node, regardless of
1377bba2c361STejun Heo  * the CPU idle state).
1378bba2c361STejun Heo  *
1379bba2c361STejun Heo  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
1380bba2c361STejun Heo  * set, this function can't tell which CPUs are idle and will always pick any
1381bba2c361STejun Heo  * CPU.
1382bba2c361STejun Heo  */
1383bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_pick_any_cpu_node(const struct cpumask *cpus_allowed,
1384bba2c361STejun Heo 					  s32 node, u64 flags,
1385bba2c361STejun Heo 					  const struct bpf_prog_aux *aux)
1386bba2c361STejun Heo {
1387bba2c361STejun Heo 	struct scx_sched *sch;
1388bba2c361STejun Heo 	s32 cpu;
1389bba2c361STejun Heo 
1390bba2c361STejun Heo 	guard(rcu)();
1391bba2c361STejun Heo 
1392bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1393bba2c361STejun Heo 	if (unlikely(!sch))
1394bba2c361STejun Heo 		return -ENODEV;
1395bba2c361STejun Heo 
1396bba2c361STejun Heo 	node = validate_node(sch, node);
1397bba2c361STejun Heo 	if (node < 0)
1398bba2c361STejun Heo 		return node;
1399bba2c361STejun Heo 
1400bba2c361STejun Heo 	cpu = scx_pick_idle_cpu(cpus_allowed, node, flags);
1401bba2c361STejun Heo 	if (cpu >= 0)
1402bba2c361STejun Heo 		return cpu;
1403bba2c361STejun Heo 
1404bba2c361STejun Heo 	if (flags & SCX_PICK_IDLE_IN_NODE)
1405bba2c361STejun Heo 		cpu = cpumask_any_and_distribute(cpumask_of_node(node), cpus_allowed);
1406bba2c361STejun Heo 	else
1407bba2c361STejun Heo 		cpu = cpumask_any_distribute(cpus_allowed);
1408bba2c361STejun Heo 	if (cpu < nr_cpu_ids)
1409bba2c361STejun Heo 		return cpu;
1410bba2c361STejun Heo 	else
1411bba2c361STejun Heo 		return -EBUSY;
1412bba2c361STejun Heo }
1413bba2c361STejun Heo 
1414bba2c361STejun Heo /**
1415bba2c361STejun Heo  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
1416bba2c361STejun Heo  * @cpus_allowed: Allowed cpumask
1417bba2c361STejun Heo  * @flags: %SCX_PICK_IDLE_CPU_* flags
1418bba2c361STejun Heo  * @aux: implicit BPF argument to access bpf_prog_aux hidden from BPF progs
1419bba2c361STejun Heo  *
1420bba2c361STejun Heo  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
1421bba2c361STejun Heo  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
1422bba2c361STejun Heo  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
1423bba2c361STejun Heo  * empty.
1424bba2c361STejun Heo  *
1425bba2c361STejun Heo  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
1426bba2c361STejun Heo  * set, this function can't tell which CPUs are idle and will always pick any
1427bba2c361STejun Heo  * CPU.
1428bba2c361STejun Heo  *
1429bba2c361STejun Heo  * Always returns an error if %SCX_OPS_BUILTIN_IDLE_PER_NODE is set, use
1430bba2c361STejun Heo  * scx_bpf_pick_any_cpu_node() instead.
1431bba2c361STejun Heo  */
1432bba2c361STejun Heo __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
1433bba2c361STejun Heo 				     u64 flags, const struct bpf_prog_aux *aux)
1434bba2c361STejun Heo {
1435bba2c361STejun Heo 	struct scx_sched *sch;
1436bba2c361STejun Heo 	s32 cpu;
1437bba2c361STejun Heo 
1438bba2c361STejun Heo 	guard(rcu)();
1439bba2c361STejun Heo 
1440bba2c361STejun Heo 	sch = scx_prog_sched(aux);
1441bba2c361STejun Heo 	if (unlikely(!sch))
1442bba2c361STejun Heo 		return -ENODEV;
1443bba2c361STejun Heo 
1444bba2c361STejun Heo 	if (static_branch_maybe(CONFIG_NUMA, &scx_builtin_idle_per_node)) {
1445bba2c361STejun Heo 		scx_error(sch, "per-node idle tracking is enabled");
1446bba2c361STejun Heo 		return -EBUSY;
1447bba2c361STejun Heo 	}
1448bba2c361STejun Heo 
1449bba2c361STejun Heo 	if (static_branch_likely(&scx_builtin_idle_enabled)) {
1450bba2c361STejun Heo 		cpu = scx_pick_idle_cpu(cpus_allowed, NUMA_NO_NODE, flags);
1451bba2c361STejun Heo 		if (cpu >= 0)
1452bba2c361STejun Heo 			return cpu;
1453bba2c361STejun Heo 	}
1454bba2c361STejun Heo 
1455bba2c361STejun Heo 	cpu = cpumask_any_distribute(cpus_allowed);
1456bba2c361STejun Heo 	if (cpu < nr_cpu_ids)
1457bba2c361STejun Heo 		return cpu;
1458bba2c361STejun Heo 	else
1459bba2c361STejun Heo 		return -EBUSY;
1460bba2c361STejun Heo }
1461bba2c361STejun Heo 
1462bba2c361STejun Heo __bpf_kfunc_end_defs();
1463bba2c361STejun Heo 
1464bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_idle)
1465bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_cpu_node, KF_IMPLICIT_ARGS)
1466bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
1467bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
1468bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask_node, KF_IMPLICIT_ARGS | KF_ACQUIRE)
1469bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_IMPLICIT_ARGS | KF_ACQUIRE)
1470bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
1471bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle, KF_IMPLICIT_ARGS)
1472bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
1473bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_IMPLICIT_ARGS | KF_RCU)
1474bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu_node, KF_IMPLICIT_ARGS | KF_RCU)
1475bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_IMPLICIT_ARGS | KF_RCU)
1476bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_idle)
1477bba2c361STejun Heo 
1478bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_idle = {
1479bba2c361STejun Heo 	.owner			= THIS_MODULE,
1480bba2c361STejun Heo 	.set			= &scx_kfunc_ids_idle,
1481bba2c361STejun Heo 	.filter			= scx_kfunc_context_filter,
1482bba2c361STejun Heo };
1483bba2c361STejun Heo 
1484bba2c361STejun Heo /*
1485bba2c361STejun Heo  * The select_cpu kfuncs internally call task_rq_lock() when invoked from an
1486bba2c361STejun Heo  * rq-unlocked context, and thus cannot be safely called from arbitrary tracing
1487bba2c361STejun Heo  * contexts where @p's pi_lock state is unknown. Keep them out of
1488bba2c361STejun Heo  * BPF_PROG_TYPE_TRACING by registering them in their own set which is exposed
1489bba2c361STejun Heo  * only to STRUCT_OPS and SYSCALL programs.
1490bba2c361STejun Heo  *
1491bba2c361STejun Heo  * These kfuncs are also members of scx_kfunc_ids_unlocked (see ext.c) because
1492bba2c361STejun Heo  * they're callable from unlocked contexts in addition to ops.select_cpu() and
1493bba2c361STejun Heo  * ops.enqueue().
1494bba2c361STejun Heo  */
1495bba2c361STejun Heo BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
1496bba2c361STejun Heo BTF_ID_FLAGS(func, __scx_bpf_select_cpu_and, KF_IMPLICIT_ARGS | KF_RCU)
1497bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_and, KF_RCU)
1498bba2c361STejun Heo BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_IMPLICIT_ARGS | KF_RCU)
1499bba2c361STejun Heo BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
1500bba2c361STejun Heo 
1501bba2c361STejun Heo static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
1502bba2c361STejun Heo 	.owner			= THIS_MODULE,
1503bba2c361STejun Heo 	.set			= &scx_kfunc_ids_select_cpu,
1504bba2c361STejun Heo 	.filter			= scx_kfunc_context_filter,
1505bba2c361STejun Heo };
1506bba2c361STejun Heo 
1507bba2c361STejun Heo int scx_idle_init(void)
1508bba2c361STejun Heo {
1509bba2c361STejun Heo 	return register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_idle) ?:
1510bba2c361STejun Heo 	       register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &scx_kfunc_set_idle) ?:
1511bba2c361STejun Heo 	       register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_idle) ?:
1512bba2c361STejun Heo 	       register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &scx_kfunc_set_select_cpu) ?:
1513bba2c361STejun Heo 	       register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &scx_kfunc_set_select_cpu);
1514bba2c361STejun Heo }
1515