xref: /linux/kernel/sched/ext.c (revision 81aae789181b5850d77dfdf74d4b85c63f0705e9)
1 /* SPDX-License-Identifier: GPL-2.0 */
2 /*
3  * Copyright (c) 2022 Meta Platforms, Inc. and affiliates.
4  * Copyright (c) 2022 Tejun Heo <tj@kernel.org>
5  * Copyright (c) 2022 David Vernet <dvernet@meta.com>
6  */
7 #define SCX_OP_IDX(op)		(offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void)))
8 
9 enum scx_consts {
10 	SCX_DSP_DFL_MAX_BATCH		= 32,
11 	SCX_WATCHDOG_MAX_TIMEOUT	= 30 * HZ,
12 
13 	SCX_EXIT_BT_LEN			= 64,
14 	SCX_EXIT_MSG_LEN		= 1024,
15 	SCX_EXIT_DUMP_DFL_LEN		= 32768,
16 };
17 
18 enum scx_exit_kind {
19 	SCX_EXIT_NONE,
20 	SCX_EXIT_DONE,
21 
22 	SCX_EXIT_UNREG = 64,	/* user-space initiated unregistration */
23 	SCX_EXIT_UNREG_BPF,	/* BPF-initiated unregistration */
24 	SCX_EXIT_UNREG_KERN,	/* kernel-initiated unregistration */
25 	SCX_EXIT_SYSRQ,		/* requested by 'S' sysrq */
26 
27 	SCX_EXIT_ERROR = 1024,	/* runtime error, error msg contains details */
28 	SCX_EXIT_ERROR_BPF,	/* ERROR but triggered through scx_bpf_error() */
29 	SCX_EXIT_ERROR_STALL,	/* watchdog detected stalled runnable tasks */
30 };
31 
32 /*
33  * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is
34  * being disabled.
35  */
36 struct scx_exit_info {
37 	/* %SCX_EXIT_* - broad category of the exit reason */
38 	enum scx_exit_kind	kind;
39 
40 	/* exit code if gracefully exiting */
41 	s64			exit_code;
42 
43 	/* textual representation of the above */
44 	const char		*reason;
45 
46 	/* backtrace if exiting due to an error */
47 	unsigned long		*bt;
48 	u32			bt_len;
49 
50 	/* informational message */
51 	char			*msg;
52 
53 	/* debug dump */
54 	char			*dump;
55 };
56 
57 /* sched_ext_ops.flags */
58 enum scx_ops_flags {
59 	/*
60 	 * Keep built-in idle tracking even if ops.update_idle() is implemented.
61 	 */
62 	SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0,
63 
64 	/*
65 	 * By default, if there are no other tasks to run on the CPU, the ext core
66 	 * keeps running the current task even after its slice expires. If this
67 	 * flag is specified, such tasks are passed to ops.enqueue() with
68 	 * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info.
69 	 */
70 	SCX_OPS_ENQ_LAST	= 1LLU << 1,
71 
72 	/*
73 	 * An exiting task may schedule after PF_EXITING is set. In such cases,
74 	 * bpf_task_from_pid() may not be able to find the task and if the BPF
75 	 * scheduler depends on pid lookup for dispatching, the task will be
76 	 * lost leading to various issues including RCU grace period stalls.
77 	 *
78 	 * To mask this problem, by default, unhashed tasks are automatically
79 	 * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't
80 	 * depend on pid lookups and wants to handle these tasks directly, the
81 	 * following flag can be used.
82 	 */
83 	SCX_OPS_ENQ_EXITING	= 1LLU << 2,
84 
85 	/*
86 	 * If set, only tasks with policy set to SCHED_EXT are attached to
87 	 * sched_ext. If clear, SCHED_NORMAL tasks are also included.
88 	 */
89 	SCX_OPS_SWITCH_PARTIAL	= 1LLU << 3,
90 
91 	SCX_OPS_ALL_FLAGS	= SCX_OPS_KEEP_BUILTIN_IDLE |
92 				  SCX_OPS_ENQ_LAST |
93 				  SCX_OPS_ENQ_EXITING |
94 				  SCX_OPS_SWITCH_PARTIAL,
95 };
96 
97 /* argument container for ops.init_task() */
98 struct scx_init_task_args {
99 	/*
100 	 * Set if ops.init_task() is being invoked on the fork path, as opposed
101 	 * to the scheduler transition path.
102 	 */
103 	bool			fork;
104 };
105 
106 /* argument container for ops.exit_task() */
107 struct scx_exit_task_args {
108 	/* Whether the task exited before running on sched_ext. */
109 	bool cancelled;
110 };
111 
112 /*
113  * Informational context provided to dump operations.
114  */
115 struct scx_dump_ctx {
116 	enum scx_exit_kind	kind;
117 	s64			exit_code;
118 	const char		*reason;
119 	u64			at_ns;
120 	u64			at_jiffies;
121 };
122 
123 /**
124  * struct sched_ext_ops - Operation table for BPF scheduler implementation
125  *
126  * Userland can implement an arbitrary scheduling policy by implementing and
127  * loading operations in this table.
128  */
129 struct sched_ext_ops {
130 	/**
131 	 * select_cpu - Pick the target CPU for a task which is being woken up
132 	 * @p: task being woken up
133 	 * @prev_cpu: the cpu @p was on before sleeping
134 	 * @wake_flags: SCX_WAKE_*
135 	 *
136 	 * Decision made here isn't final. @p may be moved to any CPU while it
137 	 * is getting dispatched for execution later. However, as @p is not on
138 	 * the rq at this point, getting the eventual execution CPU right here
139 	 * saves a small bit of overhead down the line.
140 	 *
141 	 * If an idle CPU is returned, the CPU is kicked and will try to
142 	 * dispatch. While an explicit custom mechanism can be added,
143 	 * select_cpu() serves as the default way to wake up idle CPUs.
144 	 *
145 	 * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p
146 	 * is dispatched, the ops.enqueue() callback will be skipped. Finally,
147 	 * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the
148 	 * local DSQ of whatever CPU is returned by this callback.
149 	 */
150 	s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags);
151 
152 	/**
153 	 * enqueue - Enqueue a task on the BPF scheduler
154 	 * @p: task being enqueued
155 	 * @enq_flags: %SCX_ENQ_*
156 	 *
157 	 * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch()
158 	 * or enqueue on the BPF scheduler. If not directly dispatched, the bpf
159 	 * scheduler owns @p and if it fails to dispatch @p, the task will
160 	 * stall.
161 	 *
162 	 * If @p was dispatched from ops.select_cpu(), this callback is
163 	 * skipped.
164 	 */
165 	void (*enqueue)(struct task_struct *p, u64 enq_flags);
166 
167 	/**
168 	 * dequeue - Remove a task from the BPF scheduler
169 	 * @p: task being dequeued
170 	 * @deq_flags: %SCX_DEQ_*
171 	 *
172 	 * Remove @p from the BPF scheduler. This is usually called to isolate
173 	 * the task while updating its scheduling properties (e.g. priority).
174 	 *
175 	 * The ext core keeps track of whether the BPF side owns a given task or
176 	 * not and can gracefully ignore spurious dispatches from the BPF side,
177 	 * which makes it safe to not implement this method. However, depending
178 	 * on the scheduling logic, this can lead to confusing behaviors - e.g.
179 	 * scheduling position not being updated across a priority change.
180 	 */
181 	void (*dequeue)(struct task_struct *p, u64 deq_flags);
182 
183 	/**
184 	 * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs
185 	 * @cpu: CPU to dispatch tasks for
186 	 * @prev: previous task being switched out
187 	 *
188 	 * Called when a CPU's local dsq is empty. The operation should dispatch
189 	 * one or more tasks from the BPF scheduler into the DSQs using
190 	 * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using
191 	 * scx_bpf_consume().
192 	 *
193 	 * The maximum number of times scx_bpf_dispatch() can be called without
194 	 * an intervening scx_bpf_consume() is specified by
195 	 * ops.dispatch_max_batch. See the comments on top of the two functions
196 	 * for more details.
197 	 *
198 	 * When not %NULL, @prev is an SCX task with its slice depleted. If
199 	 * @prev is still runnable, as indicated by %SCX_TASK_QUEUED being set in
200 	 * @prev->scx.flags, it is not enqueued yet and will be enqueued after
201 	 * ops.dispatch() returns. To keep executing @prev, return without
202 	 * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST.
203 	 */
204 	void (*dispatch)(s32 cpu, struct task_struct *prev);
205 
206 	/**
207 	 * tick - Periodic tick
208 	 * @p: task running currently
209 	 *
210 	 * This operation is called every 1/HZ seconds on CPUs which are
211 	 * executing an SCX task. Setting @p->scx.slice to 0 will trigger an
212 	 * immediate dispatch cycle on the CPU.
213 	 */
214 	void (*tick)(struct task_struct *p);
215 
216 	/**
217 	 * yield - Yield CPU
218 	 * @from: yielding task
219 	 * @to: optional yield target task
220 	 *
221 	 * If @to is NULL, @from is yielding the CPU to other runnable tasks.
222 	 * The BPF scheduler should ensure that other available tasks are
223 	 * dispatched before the yielding task. Return value is ignored in this
224 	 * case.
225 	 *
226 	 * If @to is not NULL, @from wants to yield the CPU to @to. If the BPF
227 	 * scheduler can implement the request, return %true; otherwise, %false.
228 	 */
229 	bool (*yield)(struct task_struct *from, struct task_struct *to);
230 
231 	/**
232 	 * set_weight - Set task weight
233 	 * @p: task to set weight for
234 	 * @weight: new weight [1..10000]
235 	 *
236 	 * Update @p's weight to @weight.
237 	 */
238 	void (*set_weight)(struct task_struct *p, u32 weight);
239 
240 	/**
241 	 * set_cpumask - Set CPU affinity
242 	 * @p: task to set CPU affinity for
243 	 * @cpumask: cpumask of cpus that @p can run on
244 	 *
245 	 * Update @p's CPU affinity to @cpumask.
246 	 */
247 	void (*set_cpumask)(struct task_struct *p,
248 			    const struct cpumask *cpumask);
249 
250 	/**
251 	 * update_idle - Update the idle state of a CPU
252 	 * @cpu: CPU to update the idle state for
253 	 * @idle: whether entering or exiting the idle state
254 	 *
255 	 * This operation is called when @cpu enters or leaves the idle
256 	 * state. By default, implementing this operation disables the built-in
257 	 * idle CPU tracking and the following helpers become unavailable:
258 	 *
259 	 * - scx_bpf_select_cpu_dfl()
260 	 * - scx_bpf_test_and_clear_cpu_idle()
261 	 * - scx_bpf_pick_idle_cpu()
262 	 *
263 	 * The user also must implement ops.select_cpu() as the default
264 	 * implementation relies on scx_bpf_select_cpu_dfl().
265 	 *
266 	 * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
267 	 * tracking.
268 	 */
269 	void (*update_idle)(s32 cpu, bool idle);
270 
271 	/**
272 	 * init_task - Initialize a task to run in a BPF scheduler
273 	 * @p: task to initialize for BPF scheduling
274 	 * @args: init arguments, see the struct definition
275 	 *
276 	 * Either we're loading a BPF scheduler or a new task is being forked.
277 	 * Initialize @p for BPF scheduling. This operation may block and can
278 	 * be used for allocations, and is called exactly once for a task.
279 	 *
280 	 * Return 0 for success, -errno for failure. An error return while
281 	 * loading will abort loading of the BPF scheduler. During a fork, it
282 	 * will abort that specific fork.
283 	 */
284 	s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args);
285 
286 	/**
287 	 * exit_task - Exit a previously-running task from the system
288 	 * @p: task to exit
289 	 *
290 	 * @p is exiting or the BPF scheduler is being unloaded. Perform any
291 	 * necessary cleanup for @p.
292 	 */
293 	void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args);
294 
295 	/**
296 	 * enable - Enable BPF scheduling for a task
297 	 * @p: task to enable BPF scheduling for
298 	 *
299 	 * Enable @p for BPF scheduling. enable() is called on @p any time it
300 	 * enters SCX, and is always paired with a matching disable().
301 	 */
302 	void (*enable)(struct task_struct *p);
303 
304 	/**
305 	 * disable - Disable BPF scheduling for a task
306 	 * @p: task to disable BPF scheduling for
307 	 *
308 	 * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
309 	 * Disable BPF scheduling for @p. A disable() call is always matched
310 	 * with a prior enable() call.
311 	 */
312 	void (*disable)(struct task_struct *p);
313 
314 	/**
315 	 * dump - Dump BPF scheduler state on error
316 	 * @ctx: debug dump context
317 	 *
318 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump.
319 	 */
320 	void (*dump)(struct scx_dump_ctx *ctx);
321 
322 	/**
323 	 * dump_cpu - Dump BPF scheduler state for a CPU on error
324 	 * @ctx: debug dump context
325 	 * @cpu: CPU to generate debug dump for
326 	 * @idle: @cpu is currently idle without any runnable tasks
327 	 *
328 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
329 	 * @cpu. If @idle is %true and this operation doesn't produce any
330 	 * output, @cpu is skipped for dump.
331 	 */
332 	void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle);
333 
334 	/**
335 	 * dump_task - Dump BPF scheduler state for a runnable task on error
336 	 * @ctx: debug dump context
337 	 * @p: runnable task to generate debug dump for
338 	 *
339 	 * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for
340 	 * @p.
341 	 */
342 	void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p);
343 
344 	/*
345 	 * All online ops must come before ops.init().
346 	 */
347 
348 	/**
349 	 * init - Initialize the BPF scheduler
350 	 */
351 	s32 (*init)(void);
352 
353 	/**
354 	 * exit - Clean up after the BPF scheduler
355 	 * @info: Exit info
356 	 */
357 	void (*exit)(struct scx_exit_info *info);
358 
359 	/**
360 	 * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch
361 	 */
362 	u32 dispatch_max_batch;
363 
364 	/**
365 	 * flags - %SCX_OPS_* flags
366 	 */
367 	u64 flags;
368 
369 	/**
370 	 * timeout_ms - The maximum amount of time, in milliseconds, that a
371 	 * runnable task should be able to wait before being scheduled. The
372 	 * maximum timeout may not exceed the default timeout of 30 seconds.
373 	 *
374 	 * Defaults to the maximum allowed timeout value of 30 seconds.
375 	 */
376 	u32 timeout_ms;
377 
378 	/**
379 	 * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default
380 	 * value of 32768 is used.
381 	 */
382 	u32 exit_dump_len;
383 
384 	/**
385 	 * name - BPF scheduler's name
386 	 *
387 	 * Must be a non-zero valid BPF object name including only isalnum(),
388 	 * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the
389 	 * BPF scheduler is enabled.
390 	 */
391 	char name[SCX_OPS_NAME_LEN];
392 };
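/*
 * Illustrative sketch only (not part of this file): a minimal global-FIFO
 * scheduler on the BPF side might populate just a couple of the ops above,
 * e.g.:
 *
 *	void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
 *	}
 *
 *	void BPF_STRUCT_OPS(fifo_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(SCX_DSQ_GLOBAL);
 *	}
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops fifo_ops = {
 *		.enqueue	= (void *)fifo_enqueue,
 *		.dispatch	= (void *)fifo_dispatch,
 *		.name		= "fifo",
 *	};
 *
 * The BPF_STRUCT_OPS() and SEC() conventions come from the userspace BPF
 * scheduler tooling and are shown here as an assumption; the referenced
 * kfuncs and constants (scx_bpf_dispatch(), scx_bpf_consume(),
 * SCX_DSQ_GLOBAL, SCX_SLICE_DFL) are the ones described in the comments
 * above and used elsewhere in this file.
 */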
393 
394 enum scx_opi {
395 	SCX_OPI_BEGIN			= 0,
396 	SCX_OPI_NORMAL_BEGIN		= 0,
397 	SCX_OPI_NORMAL_END		= SCX_OP_IDX(init),
398 	SCX_OPI_END			= SCX_OP_IDX(init),
399 };
400 
401 enum scx_wake_flags {
402 	/* expose select WF_* flags as enums */
403 	SCX_WAKE_FORK		= WF_FORK,
404 	SCX_WAKE_TTWU		= WF_TTWU,
405 	SCX_WAKE_SYNC		= WF_SYNC,
406 };
407 
408 enum scx_enq_flags {
409 	/* expose select ENQUEUE_* flags as enums */
410 	SCX_ENQ_WAKEUP		= ENQUEUE_WAKEUP,
411 	SCX_ENQ_HEAD		= ENQUEUE_HEAD,
412 
413 	/* high 32bits are SCX specific */
414 
415 	/*
416 	 * Set the following to trigger preemption when calling
417 	 * scx_bpf_dispatch() with a local dsq as the target. The slice of the
418 	 * current task is cleared to zero and the CPU is kicked into the
419 	 * scheduling path. Implies %SCX_ENQ_HEAD.
420 	 */
421 	SCX_ENQ_PREEMPT		= 1LLU << 32,
422 
423 	/*
424 	 * The task being enqueued is the only task available for the cpu. By
425 	 * default, ext core keeps executing such tasks but when
426 	 * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the
427 	 * %SCX_ENQ_LAST flag set.
428 	 *
429 	 * If the BPF scheduler wants to continue executing the task,
430 	 * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately.
431 	 * If the task gets queued on a different dsq or the BPF side, the BPF
432 	 * scheduler is responsible for triggering a follow-up scheduling event.
433 	 * Otherwise, execution may stall.
434 	 */
435 	SCX_ENQ_LAST		= 1LLU << 41,
436 
437 	/* high 8 bits are internal */
438 	__SCX_ENQ_INTERNAL_MASK	= 0xffLLU << 56,
439 
440 	SCX_ENQ_CLEAR_OPSS	= 1LLU << 56,
441 };
442 
443 enum scx_deq_flags {
444 	/* expose select DEQUEUE_* flags as enums */
445 	SCX_DEQ_SLEEP		= DEQUEUE_SLEEP,
446 };
447 
448 enum scx_pick_idle_cpu_flags {
449 	SCX_PICK_IDLE_CORE	= 1LLU << 0,	/* pick a CPU whose SMT siblings are also idle */
450 };
451 
452 enum scx_kick_flags {
453 	/*
454 	 * Kick the target CPU if idle. Guarantees that the target CPU goes
455 	 * through at least one full scheduling cycle before going idle. If the
456 	 * target CPU can be determined to be currently not idle and going to go
457 	 * through a scheduling cycle before going idle, noop.
458 	 */
459 	SCX_KICK_IDLE		= 1LLU << 0,
460 
461 	/*
462 	 * Preempt the current task and execute the dispatch path. If the
463 	 * current task of the target CPU is an SCX task, its ->scx.slice is
464 	 * cleared to zero before the scheduling path is invoked so that the
465 	 * task expires and the dispatch path is invoked.
466 	 */
467 	SCX_KICK_PREEMPT	= 1LLU << 1,
468 };
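/*
 * Usage note (a sketch, not taken verbatim from this file): these flags are
 * passed to the CPU-kicking kfunc exposed to the BPF scheduler, e.g.
 * scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT) to force @cpu through the dispatch
 * path even if it is currently running an SCX task.
 */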
469 
470 enum scx_ops_enable_state {
471 	SCX_OPS_PREPPING,
472 	SCX_OPS_ENABLING,
473 	SCX_OPS_ENABLED,
474 	SCX_OPS_DISABLING,
475 	SCX_OPS_DISABLED,
476 };
477 
478 static const char *scx_ops_enable_state_str[] = {
479 	[SCX_OPS_PREPPING]	= "prepping",
480 	[SCX_OPS_ENABLING]	= "enabling",
481 	[SCX_OPS_ENABLED]	= "enabled",
482 	[SCX_OPS_DISABLING]	= "disabling",
483 	[SCX_OPS_DISABLED]	= "disabled",
484 };
485 
486 /*
487  * sched_ext_entity->ops_state
488  *
489  * Used to track the task ownership between the SCX core and the BPF scheduler.
490  * State transitions look as follows:
491  *
492  * NONE -> QUEUEING -> QUEUED -> DISPATCHING
493  *   ^              |                 |
494  *   |              v                 v
495  *   \-------------------------------/
496  *
497  * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call
498  * sites for explanations on the conditions being waited upon and why they are
499  * safe. Transitions out of them into NONE or QUEUED must store_release and the
500  * waiters should load_acquire.
501  *
502  * Tracking scx_ops_state enables sched_ext core to reliably determine whether
503  * any given task can be dispatched by the BPF scheduler at all times and thus
504  * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler
505  * to try to dispatch any task anytime regardless of its state as the SCX core
506  * can safely reject invalid dispatches.
507  */
508 enum scx_ops_state {
509 	SCX_OPSS_NONE,		/* owned by the SCX core */
510 	SCX_OPSS_QUEUEING,	/* in transit to the BPF scheduler */
511 	SCX_OPSS_QUEUED,	/* owned by the BPF scheduler */
512 	SCX_OPSS_DISPATCHING,	/* in transit back to the SCX core */
513 
514 	/*
515 	 * QSEQ brands each QUEUED instance so that, when dispatch races
516 	 * dequeue/requeue, the dispatcher can tell whether it still has a claim
517 	 * on the task being dispatched.
518 	 *
519 	 * As some 32bit archs can't do 64bit store_release/load_acquire,
520 	 * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on
521 	 * 32bit machines. The dispatch race window QSEQ protects is very narrow
522 	 * and runs with IRQ disabled. 30 bits should be sufficient.
523 	 */
524 	SCX_OPSS_QSEQ_SHIFT	= 2,
525 };
526 
527 /* Use macros to ensure that the type is unsigned long for the masks */
528 #define SCX_OPSS_STATE_MASK	((1LU << SCX_OPSS_QSEQ_SHIFT) - 1)
529 #define SCX_OPSS_QSEQ_MASK	(~SCX_OPSS_STATE_MASK)
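/*
 * A snapshot of p->scx.ops_state packs the state in the low bits and the
 * QSEQ brand above them, e.g.:
 *
 *	unsigned long opss = atomic_long_read(&p->scx.ops_state);
 *	enum scx_ops_state state = opss & SCX_OPSS_STATE_MASK;
 *	unsigned long qseq = opss >> SCX_OPSS_QSEQ_SHIFT;
 */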
530 
531 /*
532  * During exit, a task may schedule after losing its PIDs. When disabling the
533  * BPF scheduler, we need to be able to iterate tasks in every state to
534  * guarantee system safety. Maintain a dedicated task list which contains every
535  * task between its fork and eventual free.
536  */
537 static DEFINE_SPINLOCK(scx_tasks_lock);
538 static LIST_HEAD(scx_tasks);
539 
540 /* ops enable/disable */
541 static struct kthread_worker *scx_ops_helper;
542 static DEFINE_MUTEX(scx_ops_enable_mutex);
543 DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
544 DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
545 static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
546 static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0);
547 static bool scx_switching_all;
548 DEFINE_STATIC_KEY_FALSE(__scx_switched_all);
549 
550 static struct sched_ext_ops scx_ops;
551 static bool scx_warned_zero_slice;
552 
553 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last);
554 static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting);
555 static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled);
556 
557 struct static_key_false scx_has_op[SCX_OPI_END] =
558 	{ [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT };
559 
560 static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE);
561 static struct scx_exit_info *scx_exit_info;
562 
563 static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0);
564 
565 /*
566  * The maximum amount of time in jiffies that a task may be runnable without
567  * being scheduled on a CPU. If this timeout is exceeded, it will trigger
568  * scx_ops_error().
569  */
570 static unsigned long scx_watchdog_timeout;
571 
572 /*
573  * The last time the delayed work was run. This delayed work relies on
574  * ksoftirqd being able to run to service timer interrupts, so it's possible
575  * that this work itself could get wedged. To account for this, we check that
576  * it's not stalled in the timer tick, and trigger an error if it is.
577  */
578 static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES;
579 
580 static struct delayed_work scx_watchdog_work;
581 
582 /* idle tracking */
583 #ifdef CONFIG_SMP
584 #ifdef CONFIG_CPUMASK_OFFSTACK
585 #define CL_ALIGNED_IF_ONSTACK
586 #else
587 #define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp
588 #endif
589 
590 static struct {
591 	cpumask_var_t cpu;
592 	cpumask_var_t smt;
593 } idle_masks CL_ALIGNED_IF_ONSTACK;
594 
595 #endif	/* CONFIG_SMP */
596 
597 /*
598  * Direct dispatch marker.
599  *
600  * Non-NULL values are used for direct dispatch from enqueue path. A valid
601  * pointer points to the task currently being enqueued. An ERR_PTR value is used
602  * to indicate that direct dispatch has already happened.
603  */
604 static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task);
605 
606 /* dispatch queues */
607 static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global;
608 
609 static const struct rhashtable_params dsq_hash_params = {
610 	.key_len		= 8,
611 	.key_offset		= offsetof(struct scx_dispatch_q, id),
612 	.head_offset		= offsetof(struct scx_dispatch_q, hash_node),
613 };
614 
615 static struct rhashtable dsq_hash;
616 static LLIST_HEAD(dsqs_to_free);
617 
618 /* dispatch buf */
619 struct scx_dsp_buf_ent {
620 	struct task_struct	*task;
621 	unsigned long		qseq;
622 	u64			dsq_id;
623 	u64			enq_flags;
624 };
625 
626 static u32 scx_dsp_max_batch;
627 
628 struct scx_dsp_ctx {
629 	struct rq		*rq;
630 	struct rq_flags		*rf;
631 	u32			cursor;
632 	u32			nr_tasks;
633 	struct scx_dsp_buf_ent	buf[];
634 };
635 
636 static struct scx_dsp_ctx __percpu *scx_dsp_ctx;
637 
638 /* string formatting from BPF */
639 struct scx_bstr_buf {
640 	u64			data[MAX_BPRINTF_VARARGS];
641 	char			line[SCX_EXIT_MSG_LEN];
642 };
643 
644 static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock);
645 static struct scx_bstr_buf scx_exit_bstr_buf;
646 
647 /* ops debug dump */
648 struct scx_dump_data {
649 	s32			cpu;
650 	bool			first;
651 	s32			cursor;
652 	struct seq_buf		*s;
653 	const char		*prefix;
654 	struct scx_bstr_buf	buf;
655 };
656 
657 struct scx_dump_data scx_dump_data = {
658 	.cpu			= -1,
659 };
660 
661 /* /sys/kernel/sched_ext interface */
662 static struct kset *scx_kset;
663 static struct kobject *scx_root_kobj;
664 
665 #define CREATE_TRACE_POINTS
666 #include <trace/events/sched_ext.h>
667 
668 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
669 					     s64 exit_code,
670 					     const char *fmt, ...);
671 
672 #define scx_ops_error_kind(err, fmt, args...)					\
673 	scx_ops_exit_kind((err), 0, fmt, ##args)
674 
675 #define scx_ops_exit(code, fmt, args...)					\
676 	scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args)
677 
678 #define scx_ops_error(fmt, args...)						\
679 	scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args)
680 
681 #define SCX_HAS_OP(op)	static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
682 
683 static long jiffies_delta_msecs(unsigned long at, unsigned long now)
684 {
685 	if (time_after(at, now))
686 		return jiffies_to_msecs(at - now);
687 	else
688 		return -(long)jiffies_to_msecs(now - at);
689 }
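/*
 * For example, at == now + 2 * HZ yields 2000 (2s in the future) while
 * at == now - HZ yields -1000 (1s in the past), independent of HZ.
 */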
690 
691 /* if the highest set bit is N, return a mask with bits [N+1, 31] set */
692 static u32 higher_bits(u32 flags)
693 {
694 	return ~((1 << fls(flags)) - 1);
695 }
696 
697 /* return the mask with only the highest bit set */
698 static u32 highest_bit(u32 flags)
699 {
700 	int bit = fls(flags);
701 	return ((u64)1 << bit) >> 1;
702 }
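/*
 * Worked example: for flags == 0x5 (0b0101), fls() returns 3, so
 * higher_bits() yields a mask with bits [3, 31] set and highest_bit()
 * yields 0x4 (0b0100). For flags == 0, higher_bits() returns ~0 and
 * highest_bit() returns 0.
 */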
703 
704 /*
705  * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX
706  * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate
707  * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check
708  * whether it's running from an allowed context.
709  *
710  * @mask is constant, always inline to cull the mask calculations.
711  */
712 static __always_inline void scx_kf_allow(u32 mask)
713 {
714 	/* nesting is allowed only in increasing scx_kf_mask order */
715 	WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask,
716 		  "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n",
717 		  current->scx.kf_mask, mask);
718 	current->scx.kf_mask |= mask;
719 	barrier();
720 }
721 
722 static void scx_kf_disallow(u32 mask)
723 {
724 	barrier();
725 	current->scx.kf_mask &= ~mask;
726 }
727 
728 #define SCX_CALL_OP(mask, op, args...)						\
729 do {										\
730 	if (mask) {								\
731 		scx_kf_allow(mask);						\
732 		scx_ops.op(args);						\
733 		scx_kf_disallow(mask);						\
734 	} else {								\
735 		scx_ops.op(args);						\
736 	}									\
737 } while (0)
738 
739 #define SCX_CALL_OP_RET(mask, op, args...)					\
740 ({										\
741 	__typeof__(scx_ops.op(args)) __ret;					\
742 	if (mask) {								\
743 		scx_kf_allow(mask);						\
744 		__ret = scx_ops.op(args);					\
745 		scx_kf_disallow(mask);						\
746 	} else {								\
747 		__ret = scx_ops.op(args);					\
748 	}									\
749 	__ret;									\
750 })
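/*
 * For example, the enqueue path below invokes the BPF op as
 *
 *	SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
 *
 * which makes the kfuncs gated behind SCX_KF_ENQUEUE callable for the
 * duration of ops.enqueue() and disallows them again afterwards.
 */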
751 
752 /* @mask is constant, always inline to cull unnecessary branches */
753 static __always_inline bool scx_kf_allowed(u32 mask)
754 {
755 	if (unlikely(!(current->scx.kf_mask & mask))) {
756 		scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x",
757 			      mask, current->scx.kf_mask);
758 		return false;
759 	}
760 
761 	if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) {
762 		scx_ops_error("sleepable kfunc called from non-sleepable context");
763 		return false;
764 	}
765 
766 	/*
767 	 * Enforce nesting boundaries. e.g. A kfunc which can be called from
768 	 * DISPATCH must not be called if we're running DEQUEUE which is nested
769 	 * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE
770 	 * boundary thanks to the above in_interrupt() check.
771 	 */
772 	if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH &&
773 		     (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) {
774 		scx_ops_error("dispatch kfunc called from a nested operation");
775 		return false;
776 	}
777 
778 	return true;
779 }
780 
781 
782 /*
783  * SCX task iterator.
784  */
785 struct scx_task_iter {
786 	struct sched_ext_entity		cursor;
787 	struct task_struct		*locked;
788 	struct rq			*rq;
789 	struct rq_flags			rf;
790 };
791 
792 /**
793  * scx_task_iter_init - Initialize a task iterator
794  * @iter: iterator to init
795  *
796  * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized,
797  * @iter must eventually be exited with scx_task_iter_exit().
798  *
799  * scx_tasks_lock may be released between this and the first next() call or
800  * between any two next() calls. If scx_tasks_lock is released between two
801  * next() calls, the caller is responsible for ensuring that the task being
802  * iterated remains accessible either through RCU read lock or obtaining a
803  * reference count.
804  *
805  * All tasks which existed when the iteration started are guaranteed to be
806  * visited as long as they still exist.
807  */
808 static void scx_task_iter_init(struct scx_task_iter *iter)
809 {
810 	lockdep_assert_held(&scx_tasks_lock);
811 
812 	iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR };
813 	list_add(&iter->cursor.tasks_node, &scx_tasks);
814 	iter->locked = NULL;
815 }
816 
817 /**
818  * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator
819  * @iter: iterator to unlock rq for
820  *
821  * If @iter is in the middle of a locked iteration, it may be locking the rq of
822  * the task currently being visited. Unlock the rq if so. This function can be
823  * safely called anytime during an iteration.
824  *
825 	 * Returns %true if @iter was locking an rq and it has been unlocked, %false
826 	 * if @iter was not locking an rq.
827  */
828 static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
829 {
830 	if (iter->locked) {
831 		task_rq_unlock(iter->rq, iter->locked, &iter->rf);
832 		iter->locked = NULL;
833 		return true;
834 	} else {
835 		return false;
836 	}
837 }
838 
839 /**
840  * scx_task_iter_exit - Exit a task iterator
841  * @iter: iterator to exit
842  *
843  * Exit a previously initialized @iter. Must be called with scx_tasks_lock held.
844  * If the iterator holds a task's rq lock, that rq lock is released. See
845  * scx_task_iter_init() for details.
846  */
847 static void scx_task_iter_exit(struct scx_task_iter *iter)
848 {
849 	lockdep_assert_held(&scx_tasks_lock);
850 
851 	scx_task_iter_rq_unlock(iter);
852 	list_del_init(&iter->cursor.tasks_node);
853 }
854 
855 /**
856  * scx_task_iter_next - Next task
857  * @iter: iterator to walk
858  *
859  * Visit the next task. See scx_task_iter_init() for details.
860  */
861 static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter)
862 {
863 	struct list_head *cursor = &iter->cursor.tasks_node;
864 	struct sched_ext_entity *pos;
865 
866 	lockdep_assert_held(&scx_tasks_lock);
867 
868 	list_for_each_entry(pos, cursor, tasks_node) {
869 		if (&pos->tasks_node == &scx_tasks)
870 			return NULL;
871 		if (!(pos->flags & SCX_TASK_CURSOR)) {
872 			list_move(cursor, &pos->tasks_node);
873 			return container_of(pos, struct task_struct, scx);
874 		}
875 	}
876 
877 	/* can't happen, should always terminate at scx_tasks above */
878 	BUG();
879 }
880 
881 /**
882  * scx_task_iter_next_locked - Next non-idle task with its rq locked
883  * @iter: iterator to walk
884  * @include_dead: Whether we should include dead tasks in the iteration
885  *
886  * Visit the non-idle task with its rq lock held. Allows callers to specify
887  * whether they would like to filter out dead tasks. See scx_task_iter_init()
888  * for details.
889  */
890 static struct task_struct *
891 scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead)
892 {
893 	struct task_struct *p;
894 retry:
895 	scx_task_iter_rq_unlock(iter);
896 
897 	while ((p = scx_task_iter_next(iter))) {
898 		/*
899 		 * is_idle_task() tests %PF_IDLE which may not be set for CPUs
900 		 * which haven't yet been onlined. Test sched_class directly.
901 		 */
902 		if (p->sched_class != &idle_sched_class)
903 			break;
904 	}
905 	if (!p)
906 		return NULL;
907 
908 	iter->rq = task_rq_lock(p, &iter->rf);
909 	iter->locked = p;
910 
911 	/*
912 	 * If we see %TASK_DEAD, @p already disabled preemption, is about to do
913 	 * the final __schedule(), won't ever need to be scheduled again and can
914 	 * thus be safely ignored. If we don't see %TASK_DEAD, @p can't enter
915 	 * the final __schedule() while we're locking its rq and thus will stay
916 	 * alive until the rq is unlocked.
917 	 */
918 	if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD)
919 		goto retry;
920 
921 	return p;
922 }
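/*
 * A sketch of the intended calling pattern, mirroring the enable/disable
 * paths later in this file (variable names are illustrative):
 *
 *	struct scx_task_iter sti;
 *	struct task_struct *p;
 *
 *	spin_lock_irq(&scx_tasks_lock);
 *	scx_task_iter_init(&sti);
 *	while ((p = scx_task_iter_next_locked(&sti, false))) {
 *		... p's rq is locked here, operate on p ...
 *	}
 *	scx_task_iter_exit(&sti);
 *	spin_unlock_irq(&scx_tasks_lock);
 */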
923 
924 static enum scx_ops_enable_state scx_ops_enable_state(void)
925 {
926 	return atomic_read(&scx_ops_enable_state_var);
927 }
928 
929 static enum scx_ops_enable_state
930 scx_ops_set_enable_state(enum scx_ops_enable_state to)
931 {
932 	return atomic_xchg(&scx_ops_enable_state_var, to);
933 }
934 
935 static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to,
936 					enum scx_ops_enable_state from)
937 {
938 	int from_v = from;
939 
940 	return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to);
941 }
942 
943 static bool scx_ops_bypassing(void)
944 {
945 	return unlikely(atomic_read(&scx_ops_bypass_depth));
946 }
947 
948 /**
949  * wait_ops_state - Busy-wait the specified ops state to end
950  * @p: target task
951  * @opss: state to wait the end of
952  *
953 	 * state part of @opss is %SCX_OPSS_QUEUEING or %SCX_OPSS_DISPATCHING. This
954 	 * function also has load_acquire semantics to ensure that the caller can see
955 	 * the updates made in the enqueueing and dispatching paths.
956  * in the enqueueing and dispatching paths.
957  */
958 static void wait_ops_state(struct task_struct *p, unsigned long opss)
959 {
960 	do {
961 		cpu_relax();
962 	} while (atomic_long_read_acquire(&p->scx.ops_state) == opss);
963 }
964 
965 /**
966  * ops_cpu_valid - Verify a cpu number
967  * @cpu: cpu number which came from a BPF ops
968  * @where: extra information reported on error
969  *
970  * @cpu is a cpu number which came from the BPF scheduler and can be any value.
971  * Verify that it is in range and one of the possible cpus. If invalid, trigger
972  * an ops error.
973  */
974 static bool ops_cpu_valid(s32 cpu, const char *where)
975 {
976 	if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) {
977 		return true;
978 	} else {
979 		scx_ops_error("invalid CPU %d%s%s", cpu,
980 			      where ? " " : "", where ?: "");
981 		return false;
982 	}
983 }
984 
985 /**
986  * ops_sanitize_err - Sanitize a -errno value
987  * @ops_name: operation to blame on failure
988  * @err: -errno value to sanitize
989  *
990  * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return
991  * -%EPROTO. This is necessary because returning a rogue -errno up the chain can
992 	 * cause misbehaviors. For example, a large negative return from
993  * ops.init_task() triggers an oops when passed up the call chain because the
994  * value fails IS_ERR() test after being encoded with ERR_PTR() and then is
995  * handled as a pointer.
996  */
997 static int ops_sanitize_err(const char *ops_name, s32 err)
998 {
999 	if (err < 0 && err >= -MAX_ERRNO)
1000 		return err;
1001 
1002 	scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err);
1003 	return -EPROTO;
1004 }
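/*
 * Typical use (an illustration of the pattern used by the ops-calling paths
 * further down, not copied verbatim):
 *
 *	ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args);
 *	if (unlikely(ret))
 *		ret = ops_sanitize_err("init_task", ret);
 */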
1005 
1006 static void update_curr_scx(struct rq *rq)
1007 {
1008 	struct task_struct *curr = rq->curr;
1009 	u64 now = rq_clock_task(rq);
1010 	u64 delta_exec;
1011 
1012 	if (time_before_eq64(now, curr->se.exec_start))
1013 		return;
1014 
1015 	delta_exec = now - curr->se.exec_start;
1016 	curr->se.exec_start = now;
1017 	curr->se.sum_exec_runtime += delta_exec;
1018 	account_group_exec_runtime(curr, delta_exec);
1019 	cgroup_account_cputime(curr, delta_exec);
1020 
1021 	curr->scx.slice -= min(curr->scx.slice, delta_exec);
1022 }
1023 
1024 static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta)
1025 {
1026 	/* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */
1027 	WRITE_ONCE(dsq->nr, dsq->nr + delta);
1028 }
1029 
1030 static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
1031 			     u64 enq_flags)
1032 {
1033 	bool is_local = dsq->id == SCX_DSQ_LOCAL;
1034 
1035 	WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node));
1036 
1037 	if (!is_local) {
1038 		raw_spin_lock(&dsq->lock);
1039 		if (unlikely(dsq->id == SCX_DSQ_INVALID)) {
1040 			scx_ops_error("attempting to dispatch to a destroyed dsq");
1041 			/* fall back to the global dsq */
1042 			raw_spin_unlock(&dsq->lock);
1043 			dsq = &scx_dsq_global;
1044 			raw_spin_lock(&dsq->lock);
1045 		}
1046 	}
1047 
1048 	if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT))
1049 		list_add(&p->scx.dsq_node, &dsq->list);
1050 	else
1051 		list_add_tail(&p->scx.dsq_node, &dsq->list);
1052 
1053 	dsq_mod_nr(dsq, 1);
1054 	p->scx.dsq = dsq;
1055 
1056 	/*
1057 	 * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the
1058 	 * direct dispatch path, but we clear them here because the direct
1059 	 * dispatch verdict may be overridden on the enqueue path during e.g.
1060 	 * bypass.
1061 	 */
1062 	p->scx.ddsp_dsq_id = SCX_DSQ_INVALID;
1063 	p->scx.ddsp_enq_flags = 0;
1064 
1065 	/*
1066 	 * We're transitioning out of QUEUEING or DISPATCHING. store_release to
1067 	 * match waiters' load_acquire.
1068 	 */
1069 	if (enq_flags & SCX_ENQ_CLEAR_OPSS)
1070 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1071 
1072 	if (is_local) {
1073 		struct rq *rq = container_of(dsq, struct rq, scx.local_dsq);
1074 		bool preempt = false;
1075 
1076 		if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr &&
1077 		    rq->curr->sched_class == &ext_sched_class) {
1078 			rq->curr->scx.slice = 0;
1079 			preempt = true;
1080 		}
1081 
1082 		if (preempt || sched_class_above(&ext_sched_class,
1083 						 rq->curr->sched_class))
1084 			resched_curr(rq);
1085 	} else {
1086 		raw_spin_unlock(&dsq->lock);
1087 	}
1088 }
1089 
1090 static void dispatch_dequeue(struct rq *rq, struct task_struct *p)
1091 {
1092 	struct scx_dispatch_q *dsq = p->scx.dsq;
1093 	bool is_local = dsq == &rq->scx.local_dsq;
1094 
1095 	if (!dsq) {
1096 		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
1097 		/*
1098 		 * When dispatching directly from the BPF scheduler to a local
1099 		 * DSQ, the task isn't associated with any DSQ but
1100 		 * @p->scx.holding_cpu may be set under the protection of
1101 		 * %SCX_OPSS_DISPATCHING.
1102 		 */
1103 		if (p->scx.holding_cpu >= 0)
1104 			p->scx.holding_cpu = -1;
1105 		return;
1106 	}
1107 
1108 	if (!is_local)
1109 		raw_spin_lock(&dsq->lock);
1110 
1111 	/*
1112 	 * Now that we hold @dsq->lock, @p->scx.holding_cpu and @p->scx.dsq_node
1113 	 * can't change underneath us.
1114 	 */
1115 	if (p->scx.holding_cpu < 0) {
1116 		/* @p must still be on @dsq, dequeue */
1117 		WARN_ON_ONCE(list_empty(&p->scx.dsq_node));
1118 		list_del_init(&p->scx.dsq_node);
1119 		dsq_mod_nr(dsq, -1);
1120 	} else {
1121 		/*
1122 		 * We're racing against dispatch_to_local_dsq() which already
1123 		 * removed @p from @dsq and set @p->scx.holding_cpu. Clear the
1124 		 * holding_cpu which tells dispatch_to_local_dsq() that it lost
1125 		 * the race.
1126 		 */
1127 		WARN_ON_ONCE(!list_empty(&p->scx.dsq_node));
1128 		p->scx.holding_cpu = -1;
1129 	}
1130 	p->scx.dsq = NULL;
1131 
1132 	if (!is_local)
1133 		raw_spin_unlock(&dsq->lock);
1134 }
1135 
1136 static struct scx_dispatch_q *find_user_dsq(u64 dsq_id)
1137 {
1138 	return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params);
1139 }
1140 
1141 static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id)
1142 {
1143 	lockdep_assert(rcu_read_lock_any_held());
1144 
1145 	if (dsq_id == SCX_DSQ_GLOBAL)
1146 		return &scx_dsq_global;
1147 	else
1148 		return find_user_dsq(dsq_id);
1149 }
1150 
1151 static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id,
1152 						    struct task_struct *p)
1153 {
1154 	struct scx_dispatch_q *dsq;
1155 
1156 	if (dsq_id == SCX_DSQ_LOCAL)
1157 		return &rq->scx.local_dsq;
1158 
1159 	dsq = find_non_local_dsq(dsq_id);
1160 	if (unlikely(!dsq)) {
1161 		scx_ops_error("non-existent DSQ 0x%llx for %s[%d]",
1162 			      dsq_id, p->comm, p->pid);
1163 		return &scx_dsq_global;
1164 	}
1165 
1166 	return dsq;
1167 }
1168 
1169 static void mark_direct_dispatch(struct task_struct *ddsp_task,
1170 				 struct task_struct *p, u64 dsq_id,
1171 				 u64 enq_flags)
1172 {
1173 	/*
1174 	 * Mark that dispatch already happened from ops.select_cpu() or
1175 	 * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value
1176 	 * which can never match a valid task pointer.
1177 	 */
1178 	__this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH));
1179 
1180 	/* @p must match the task on the enqueue path */
1181 	if (unlikely(p != ddsp_task)) {
1182 		if (IS_ERR(ddsp_task))
1183 			scx_ops_error("%s[%d] already direct-dispatched",
1184 				      p->comm, p->pid);
1185 		else
1186 			scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]",
1187 				      ddsp_task->comm, ddsp_task->pid,
1188 				      p->comm, p->pid);
1189 		return;
1190 	}
1191 
1192 	/*
1193 	 * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because
1194 	 * dispatching to the local DSQ of a different CPU requires unlocking
1195 	 * the current rq which isn't allowed in the enqueue path. Use
1196 	 * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL.
1197 	 */
1198 	if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) {
1199 		scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch");
1200 		return;
1201 	}
1202 
1203 	WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID);
1204 	WARN_ON_ONCE(p->scx.ddsp_enq_flags);
1205 
1206 	p->scx.ddsp_dsq_id = dsq_id;
1207 	p->scx.ddsp_enq_flags = enq_flags;
1208 }
1209 
1210 static void direct_dispatch(struct task_struct *p, u64 enq_flags)
1211 {
1212 	struct scx_dispatch_q *dsq;
1213 
1214 	enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS);
1215 	dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p);
1216 	dispatch_enqueue(dsq, p, enq_flags);
1217 }
1218 
1219 static bool scx_rq_online(struct rq *rq)
1220 {
1221 #ifdef CONFIG_SMP
1222 	return likely(rq->online);
1223 #else
1224 	return true;
1225 #endif
1226 }
1227 
1228 static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
1229 			    int sticky_cpu)
1230 {
1231 	struct task_struct **ddsp_taskp;
1232 	unsigned long qseq;
1233 
1234 	WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED));
1235 
1236 	/* rq migration */
1237 	if (sticky_cpu == cpu_of(rq))
1238 		goto local_norefill;
1239 
1240 	if (!scx_rq_online(rq))
1241 		goto local;
1242 
1243 	if (scx_ops_bypassing()) {
1244 		if (enq_flags & SCX_ENQ_LAST)
1245 			goto local;
1246 		else
1247 			goto global;
1248 	}
1249 
1250 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1251 		goto direct;
1252 
1253 	/* see %SCX_OPS_ENQ_EXITING */
1254 	if (!static_branch_unlikely(&scx_ops_enq_exiting) &&
1255 	    unlikely(p->flags & PF_EXITING))
1256 		goto local;
1257 
1258 	/* see %SCX_OPS_ENQ_LAST */
1259 	if (!static_branch_unlikely(&scx_ops_enq_last) &&
1260 	    (enq_flags & SCX_ENQ_LAST))
1261 		goto local;
1262 
1263 	if (!SCX_HAS_OP(enqueue))
1264 		goto global;
1265 
1266 	/* DSQ bypass didn't trigger, enqueue on the BPF scheduler */
1267 	qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT;
1268 
1269 	WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1270 	atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq);
1271 
1272 	ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
1273 	WARN_ON_ONCE(*ddsp_taskp);
1274 	*ddsp_taskp = p;
1275 
1276 	SCX_CALL_OP(SCX_KF_ENQUEUE, enqueue, p, enq_flags);
1277 
1278 	*ddsp_taskp = NULL;
1279 	if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID)
1280 		goto direct;
1281 
1282 	/*
1283 	 * If not directly dispatched, QUEUEING isn't clear yet and dispatch or
1284 	 * dequeue may be waiting. The store_release matches their load_acquire.
1285 	 */
1286 	atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq);
1287 	return;
1288 
1289 direct:
1290 	direct_dispatch(p, enq_flags);
1291 	return;
1292 
1293 local:
1294 	p->scx.slice = SCX_SLICE_DFL;
1295 local_norefill:
1296 	dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
1297 	return;
1298 
1299 global:
1300 	p->scx.slice = SCX_SLICE_DFL;
1301 	dispatch_enqueue(&scx_dsq_global, p, enq_flags);
1302 }
1303 
1304 static bool task_runnable(const struct task_struct *p)
1305 {
1306 	return !list_empty(&p->scx.runnable_node);
1307 }
1308 
1309 static void set_task_runnable(struct rq *rq, struct task_struct *p)
1310 {
1311 	lockdep_assert_rq_held(rq);
1312 
1313 	if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) {
1314 		p->scx.runnable_at = jiffies;
1315 		p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT;
1316 	}
1317 
1318 	/*
1319 	 * list_add_tail() must be used. scx_ops_bypass() depends on tasks being
1320 	 * appended to the runnable_list.
1321 	 */
1322 	list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list);
1323 }
1324 
1325 static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at)
1326 {
1327 	list_del_init(&p->scx.runnable_node);
1328 	if (reset_runnable_at)
1329 		p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
1330 }
1331 
1332 static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags)
1333 {
1334 	int sticky_cpu = p->scx.sticky_cpu;
1335 
1336 	enq_flags |= rq->scx.extra_enq_flags;
1337 
1338 	if (sticky_cpu >= 0)
1339 		p->scx.sticky_cpu = -1;
1340 
1341 	/*
1342 	 * Restoring a running task will be immediately followed by
1343 	 * set_next_task_scx() which expects the task to not be on the BPF
1344 	 * scheduler as tasks can only start running through local DSQs. Force
1345 	 * direct-dispatch into the local DSQ by setting the sticky_cpu.
1346 	 */
1347 	if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p))
1348 		sticky_cpu = cpu_of(rq);
1349 
1350 	if (p->scx.flags & SCX_TASK_QUEUED) {
1351 		WARN_ON_ONCE(!task_runnable(p));
1352 		return;
1353 	}
1354 
1355 	set_task_runnable(rq, p);
1356 	p->scx.flags |= SCX_TASK_QUEUED;
1357 	rq->scx.nr_running++;
1358 	add_nr_running(rq, 1);
1359 
1360 	do_enqueue_task(rq, p, enq_flags, sticky_cpu);
1361 }
1362 
1363 static void ops_dequeue(struct task_struct *p, u64 deq_flags)
1364 {
1365 	unsigned long opss;
1366 
1367 	/* dequeue is always temporary, don't reset runnable_at */
1368 	clr_task_runnable(p, false);
1369 
1370 	/* acquire ensures that we see the preceding updates on QUEUED */
1371 	opss = atomic_long_read_acquire(&p->scx.ops_state);
1372 
1373 	switch (opss & SCX_OPSS_STATE_MASK) {
1374 	case SCX_OPSS_NONE:
1375 		break;
1376 	case SCX_OPSS_QUEUEING:
1377 		/*
1378 		 * QUEUEING is started and finished while holding @p's rq lock.
1379 		 * As we're holding the rq lock now, we shouldn't see QUEUEING.
1380 		 */
1381 		BUG();
1382 	case SCX_OPSS_QUEUED:
1383 		if (SCX_HAS_OP(dequeue))
1384 			SCX_CALL_OP(SCX_KF_REST, dequeue, p, deq_flags);
1385 
1386 		if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
1387 					    SCX_OPSS_NONE))
1388 			break;
1389 		fallthrough;
1390 	case SCX_OPSS_DISPATCHING:
1391 		/*
1392 		 * If @p is being dispatched from the BPF scheduler to a DSQ,
1393 		 * wait for the transfer to complete so that @p doesn't get
1394 		 * added to its DSQ after dequeueing is complete.
1395 		 *
1396 		 * As we're waiting on DISPATCHING with the rq locked, the
1397 		 * dispatching side shouldn't try to lock the rq while
1398 		 * DISPATCHING is set. See dispatch_to_local_dsq().
1399 		 *
1400 		 * DISPATCHING shouldn't have qseq set and control can reach
1401 		 * here with NONE @opss from the above QUEUED case block.
1402 		 * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss.
1403 		 */
1404 		wait_ops_state(p, SCX_OPSS_DISPATCHING);
1405 		BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1406 		break;
1407 	}
1408 }
1409 
1410 static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags)
1411 {
1412 	if (!(p->scx.flags & SCX_TASK_QUEUED)) {
1413 		WARN_ON_ONCE(task_runnable(p));
1414 		return;
1415 	}
1416 
1417 	ops_dequeue(p, deq_flags);
1418 
1419 	if (deq_flags & SCX_DEQ_SLEEP)
1420 		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
1421 	else
1422 		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
1423 
1424 	p->scx.flags &= ~SCX_TASK_QUEUED;
1425 	rq->scx.nr_running--;
1426 	sub_nr_running(rq, 1);
1427 
1428 	dispatch_dequeue(rq, p);
1429 }
1430 
1431 static void yield_task_scx(struct rq *rq)
1432 {
1433 	struct task_struct *p = rq->curr;
1434 
1435 	if (SCX_HAS_OP(yield))
1436 		SCX_CALL_OP_RET(SCX_KF_REST, yield, p, NULL);
1437 	else
1438 		p->scx.slice = 0;
1439 }
1440 
1441 static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
1442 {
1443 	struct task_struct *from = rq->curr;
1444 
1445 	if (SCX_HAS_OP(yield))
1446 		return SCX_CALL_OP_RET(SCX_KF_REST, yield, from, to);
1447 	else
1448 		return false;
1449 }
1450 
1451 #ifdef CONFIG_SMP
1452 /**
1453  * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
1454  * @rq: rq to move the task into, currently locked
1455  * @p: task to move
1456  * @enq_flags: %SCX_ENQ_*
1457  *
1458  * Move @p which is currently on a different rq to @rq's local DSQ. The caller
1459  * must:
1460  *
1461  * 1. Start with exclusive access to @p either through its DSQ lock or
1462  *    %SCX_OPSS_DISPATCHING flag.
1463  *
1464  * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
1465  *
1466  * 3. Remember task_rq(@p). Release the exclusive access so that we don't
1467  *    deadlock with dequeue.
1468  *
1469  * 4. Lock @rq and the task_rq from #3.
1470  *
1471  * 5. Call this function.
1472  *
1473  * Returns %true if @p was successfully moved. %false after racing dequeue and
1474  * losing.
1475  */
1476 static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
1477 				   u64 enq_flags)
1478 {
1479 	struct rq *task_rq;
1480 
1481 	lockdep_assert_rq_held(rq);
1482 
1483 	/*
1484 	 * If dequeue got to @p while we were trying to lock both rq's, it'd
1485 	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
1486 	 * updated it to different values afterwards, as this operation can't be
1487 	 * preempted or recurse, @p->scx.holding_cpu can never become
1488 	 * raw_smp_processor_id() again before we're done. Thus, we can tell
1489 	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
1490 	 * still raw_smp_processor_id().
1491 	 *
1492 	 * See dispatch_dequeue() for the counterpart.
1493 	 */
1494 	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
1495 		return false;
1496 
1497 	/* @p->rq couldn't have changed if we're still the holding cpu */
1498 	task_rq = task_rq(p);
1499 	lockdep_assert_rq_held(task_rq);
1500 
1501 	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
1502 	deactivate_task(task_rq, p, 0);
1503 	set_task_cpu(p, cpu_of(rq));
1504 	p->scx.sticky_cpu = cpu_of(rq);
1505 
1506 	/*
1507 	 * We want to pass scx-specific enq_flags but activate_task() will
1508 	 * truncate the upper 32 bit. As we own @rq, we can pass them through
1509 	 * @rq->scx.extra_enq_flags instead.
1510 	 */
1511 	WARN_ON_ONCE(rq->scx.extra_enq_flags);
1512 	rq->scx.extra_enq_flags = enq_flags;
1513 	activate_task(rq, p, 0);
1514 	rq->scx.extra_enq_flags = 0;
1515 
1516 	return true;
1517 }
1518 
1519 /**
1520  * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
1521  * @rq: current rq which is locked
1522  * @rf: rq_flags to use when unlocking @rq
1523  * @src_rq: rq to move task from
1524  * @dst_rq: rq to move task to
1525  *
1526  * We're holding @rq lock and trying to dispatch a task from @src_rq to
1527  * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. Whether
1528  * @rq stays locked isn't important as long as the state is restored after
1529  * dispatch_to_local_dsq_unlock().
1530  */
1531 static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf,
1532 				       struct rq *src_rq, struct rq *dst_rq)
1533 {
1534 	rq_unpin_lock(rq, rf);
1535 
1536 	if (src_rq == dst_rq) {
1537 		raw_spin_rq_unlock(rq);
1538 		raw_spin_rq_lock(dst_rq);
1539 	} else if (rq == src_rq) {
1540 		double_lock_balance(rq, dst_rq);
1541 		rq_repin_lock(rq, rf);
1542 	} else if (rq == dst_rq) {
1543 		double_lock_balance(rq, src_rq);
1544 		rq_repin_lock(rq, rf);
1545 	} else {
1546 		raw_spin_rq_unlock(rq);
1547 		double_rq_lock(src_rq, dst_rq);
1548 	}
1549 }
1550 
1551 /**
1552  * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock()
1553  * @rq: current rq which is locked
1554  * @rf: rq_flags to use when unlocking @rq
1555  * @src_rq: rq to move task from
1556  * @dst_rq: rq to move task to
1557  *
1558  * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return.
1559  */
1560 static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf,
1561 					 struct rq *src_rq, struct rq *dst_rq)
1562 {
1563 	if (src_rq == dst_rq) {
1564 		raw_spin_rq_unlock(dst_rq);
1565 		raw_spin_rq_lock(rq);
1566 		rq_repin_lock(rq, rf);
1567 	} else if (rq == src_rq) {
1568 		double_unlock_balance(rq, dst_rq);
1569 	} else if (rq == dst_rq) {
1570 		double_unlock_balance(rq, src_rq);
1571 	} else {
1572 		double_rq_unlock(src_rq, dst_rq);
1573 		raw_spin_rq_lock(rq);
1574 		rq_repin_lock(rq, rf);
1575 	}
1576 }
1577 #endif	/* CONFIG_SMP */
1578 
1579 static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq,
1580 			       struct task_struct *p)
1581 {
1582 	lockdep_assert_held(&dsq->lock);	/* released on return */
1583 
1584 	/* @dsq is locked and @p is on this rq */
1585 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
1586 	list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list);
1587 	dsq_mod_nr(dsq, -1);
1588 	dsq_mod_nr(&rq->scx.local_dsq, 1);
1589 	p->scx.dsq = &rq->scx.local_dsq;
1590 	raw_spin_unlock(&dsq->lock);
1591 }
1592 
1593 #ifdef CONFIG_SMP
1594 /*
1595  * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p
1596  * can be pulled to @rq.
1597  */
1598 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq)
1599 {
1600 	int cpu = cpu_of(rq);
1601 
1602 	if (!cpumask_test_cpu(cpu, p->cpus_ptr))
1603 		return false;
1604 	if (unlikely(is_migration_disabled(p)))
1605 		return false;
1606 	if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p)))
1607 		return false;
1608 	if (!scx_rq_online(rq))
1609 		return false;
1610 	return true;
1611 }
1612 
1613 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf,
1614 				struct scx_dispatch_q *dsq,
1615 				struct task_struct *p, struct rq *task_rq)
1616 {
1617 	bool moved = false;
1618 
1619 	lockdep_assert_held(&dsq->lock);	/* released on return */
1620 
1621 	/*
1622 	 * @dsq is locked and @p is on a remote rq. @p is currently protected by
1623 	 * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab
1624 	 * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the
1625 	 * rq lock or fail, do a little dancing from our side. See
1626 	 * move_task_to_local_dsq().
1627 	 */
1628 	WARN_ON_ONCE(p->scx.holding_cpu >= 0);
1629 	list_del_init(&p->scx.dsq_node);
1630 	dsq_mod_nr(dsq, -1);
1631 	p->scx.holding_cpu = raw_smp_processor_id();
1632 	raw_spin_unlock(&dsq->lock);
1633 
1634 	rq_unpin_lock(rq, rf);
1635 	double_lock_balance(rq, task_rq);
1636 	rq_repin_lock(rq, rf);
1637 
1638 	moved = move_task_to_local_dsq(rq, p, 0);
1639 
1640 	double_unlock_balance(rq, task_rq);
1641 
1642 	return moved;
1643 }
1644 #else	/* CONFIG_SMP */
1645 static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; }
1646 static bool consume_remote_task(struct rq *rq, struct rq_flags *rf,
1647 				struct scx_dispatch_q *dsq,
1648 				struct task_struct *p, struct rq *task_rq) { return false; }
1649 #endif	/* CONFIG_SMP */
1650 
1651 static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf,
1652 			       struct scx_dispatch_q *dsq)
1653 {
1654 	struct task_struct *p;
1655 retry:
1656 	if (list_empty(&dsq->list))
1657 		return false;
1658 
1659 	raw_spin_lock(&dsq->lock);
1660 
1661 	list_for_each_entry(p, &dsq->list, scx.dsq_node) {
1662 		struct rq *task_rq = task_rq(p);
1663 
1664 		if (rq == task_rq) {
1665 			consume_local_task(rq, dsq, p);
1666 			return true;
1667 		}
1668 
1669 		if (task_can_run_on_remote_rq(p, rq)) {
1670 			if (likely(consume_remote_task(rq, rf, dsq, p, task_rq)))
1671 				return true;
1672 			goto retry;
1673 		}
1674 	}
1675 
1676 	raw_spin_unlock(&dsq->lock);
1677 	return false;
1678 }
1679 
1680 enum dispatch_to_local_dsq_ret {
1681 	DTL_DISPATCHED,		/* successfully dispatched */
1682 	DTL_LOST,		/* lost race to dequeue */
1683 	DTL_NOT_LOCAL,		/* destination is not a local DSQ */
1684 	DTL_INVALID,		/* invalid local dsq_id */
1685 };
1686 
1687 /**
1688  * dispatch_to_local_dsq - Dispatch a task to a local dsq
1689  * @rq: current rq which is locked
1690  * @rf: rq_flags to use when unlocking @rq
1691  * @dsq_id: destination dsq ID
1692  * @p: task to dispatch
1693  * @enq_flags: %SCX_ENQ_*
1694  *
1695  * We're holding @rq lock and want to dispatch @p to the local DSQ identified by
1696  * @dsq_id. This function performs all the synchronization dancing needed
1697  * because local DSQs are protected with rq locks.
1698  *
1699  * The caller must have exclusive ownership of @p (e.g. through
1700  * %SCX_OPSS_DISPATCHING).
1701  */
1702 static enum dispatch_to_local_dsq_ret
1703 dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id,
1704 		      struct task_struct *p, u64 enq_flags)
1705 {
1706 	struct rq *src_rq = task_rq(p);
1707 	struct rq *dst_rq;
1708 
1709 	/*
1710 	 * We're synchronized against dequeue through DISPATCHING. As @p can't
1711 	 * be dequeued, its task_rq and cpus_allowed are stable too.
1712 	 */
1713 	if (dsq_id == SCX_DSQ_LOCAL) {
1714 		dst_rq = rq;
1715 	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
1716 		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
1717 
1718 		if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict"))
1719 			return DTL_INVALID;
1720 		dst_rq = cpu_rq(cpu);
1721 	} else {
1722 		return DTL_NOT_LOCAL;
1723 	}
1724 
1725 	/* if dispatching to @rq that @p is already on, no lock dancing needed */
1726 	if (rq == src_rq && rq == dst_rq) {
1727 		dispatch_enqueue(&dst_rq->scx.local_dsq, p,
1728 				 enq_flags | SCX_ENQ_CLEAR_OPSS);
1729 		return DTL_DISPATCHED;
1730 	}
1731 
1732 #ifdef CONFIG_SMP
1733 	if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) {
1734 		struct rq *locked_dst_rq = dst_rq;
1735 		bool dsp;
1736 
1737 		/*
1738 		 * @p is on a possibly remote @src_rq which we need to lock to
1739 		 * move the task. If dequeue is in progress, it'd be locking
1740 		 * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq
1741 		 * lock while holding DISPATCHING.
1742 		 *
1743 		 * As DISPATCHING guarantees that @p is wholly ours, we can
1744 		 * pretend that we're moving from a DSQ and use the same
1745 		 * mechanism - mark the task under transfer with holding_cpu,
1746 		 * release DISPATCHING and then follow the same protocol.
1747 		 */
1748 		p->scx.holding_cpu = raw_smp_processor_id();
1749 
1750 		/* store_release ensures that dequeue sees the above */
1751 		atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE);
1752 
1753 		dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq);
1754 
1755 		/*
1756 		 * We don't require the BPF scheduler to avoid dispatching to
1757 		 * offline CPUs mostly for convenience but also because CPUs can
1758 		 * go offline between scx_bpf_dispatch() calls and here. If @p
1759 		 * is destined to an offline CPU, queue it on its current CPU
1760 		 * instead, which should always be safe. As this is an allowed
1761 		 * behavior, don't trigger an ops error.
1762 		 */
1763 		if (!scx_rq_online(dst_rq))
1764 			dst_rq = src_rq;
1765 
1766 		if (src_rq == dst_rq) {
1767 			/*
1768 			 * As @p is staying on the same rq, there's no need to
1769 			 * go through the full deactivate/activate cycle.
1770 			 * Optimize by abbreviating the operations in
1771 			 * move_task_to_local_dsq().
1772 			 */
1773 			dsp = p->scx.holding_cpu == raw_smp_processor_id();
1774 			if (likely(dsp)) {
1775 				p->scx.holding_cpu = -1;
1776 				dispatch_enqueue(&dst_rq->scx.local_dsq, p,
1777 						 enq_flags);
1778 			}
1779 		} else {
1780 			dsp = move_task_to_local_dsq(dst_rq, p, enq_flags);
1781 		}
1782 
1783 		/* if the destination CPU is idle, wake it up */
1784 		if (dsp && sched_class_above(p->sched_class,
1785 					     dst_rq->curr->sched_class))
1786 			resched_curr(dst_rq);
1787 
1788 		dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq);
1789 
1790 		return dsp ? DTL_DISPATCHED : DTL_LOST;
1791 	}
1792 #endif	/* CONFIG_SMP */
1793 
1794 	scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]",
1795 		      cpu_of(dst_rq), p->comm, p->pid);
1796 	return DTL_INVALID;
1797 }
1798 
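/*
 * From the BPF scheduler's side, the two local-DSQ verdicts handled above are
 * produced with scx_bpf_dispatch(). A sketch (@p, @cpu and @enq_flags are
 * whatever the scheduler happens to be working with):
 *
 *	scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
 *
 * targets the local DSQ of the CPU doing the dispatching, while
 *
 *	scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_DFL, enq_flags);
 *
 * targets @cpu's local DSQ. When issued from ops.dispatch(), both verdicts are
 * buffered and completed by finish_dispatch() below once the rq locks can be
 * juggled safely.
 */
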
1799 /**
1800  * finish_dispatch - Asynchronously finish dispatching a task
1801  * @rq: current rq which is locked
1802  * @rf: rq_flags to use when unlocking @rq
1803  * @p: task to finish dispatching
1804  * @qseq_at_dispatch: qseq when @p started getting dispatched
1805  * @dsq_id: destination DSQ ID
1806  * @enq_flags: %SCX_ENQ_*
1807  *
1808  * Dispatching to local DSQs may need to wait for queueing to complete or
1809  * require rq lock dancing. As we don't want to do either while inside
1810  * ops.dispatch() to avoid locking order inversion, we split dispatching into
1811  * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the
1812  * task and its qseq. Once ops.dispatch() returns, this function is called to
1813  * finish up.
1814  *
1815  * There is no guarantee that @p is still valid for dispatching or even that it
1816  * was valid in the first place. Make sure that the task is still owned by the
1817  * BPF scheduler and claim the ownership before dispatching.
1818  */
1819 static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
1820 			    struct task_struct *p,
1821 			    unsigned long qseq_at_dispatch,
1822 			    u64 dsq_id, u64 enq_flags)
1823 {
1824 	struct scx_dispatch_q *dsq;
1825 	unsigned long opss;
1826 
1827 retry:
1828 	/*
1829 	 * No need for _acquire here. @p is accessed only after a successful
1830 	 * try_cmpxchg to DISPATCHING.
1831 	 */
1832 	opss = atomic_long_read(&p->scx.ops_state);
1833 
1834 	switch (opss & SCX_OPSS_STATE_MASK) {
1835 	case SCX_OPSS_DISPATCHING:
1836 	case SCX_OPSS_NONE:
1837 		/* someone else already got to it */
1838 		return;
1839 	case SCX_OPSS_QUEUED:
1840 		/*
1841 		 * If qseq doesn't match, @p has gone through at least one
1842 		 * dispatch/dequeue and re-enqueue cycle between
1843 		 * scx_bpf_dispatch() and here and we have no claim on it.
1844 		 */
1845 		if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch)
1846 			return;
1847 
1848 		/*
1849 		 * While we know @p is accessible, we don't yet have a claim on
1850 		 * it - the BPF scheduler is allowed to dispatch tasks
1851 		 * spuriously and there can be a racing dequeue attempt. Let's
1852 		 * claim @p by atomically transitioning it from QUEUED to
1853 		 * DISPATCHING.
1854 		 */
1855 		if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss,
1856 						   SCX_OPSS_DISPATCHING)))
1857 			break;
1858 		goto retry;
1859 	case SCX_OPSS_QUEUEING:
1860 		/*
1861 		 * do_enqueue_task() is in the process of transferring the task
1862 		 * to the BPF scheduler while holding @p's rq lock. As we aren't
1863 		 * holding any kernel or BPF resource that the enqueue path may
1864 		 * depend upon, it's safe to wait.
1865 		 */
1866 		wait_ops_state(p, opss);
1867 		goto retry;
1868 	}
1869 
1870 	BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED));
1871 
1872 	switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) {
1873 	case DTL_DISPATCHED:
1874 		break;
1875 	case DTL_LOST:
1876 		break;
1877 	case DTL_INVALID:
1878 		dsq_id = SCX_DSQ_GLOBAL;
1879 		fallthrough;
1880 	case DTL_NOT_LOCAL:
1881 		dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()),
1882 					    dsq_id, p);
1883 		dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);
1884 		break;
1885 	}
1886 }
1887 
1888 static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
1889 {
1890 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
1891 	u32 u;
1892 
1893 	for (u = 0; u < dspc->cursor; u++) {
1894 		struct scx_dsp_buf_ent *ent = &dspc->buf[u];
1895 
1896 		finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id,
1897 				ent->enq_flags);
1898 	}
1899 
1900 	dspc->nr_tasks += dspc->cursor;
1901 	dspc->cursor = 0;
1902 }
1903 
1904 static int balance_scx(struct rq *rq, struct task_struct *prev,
1905 		       struct rq_flags *rf)
1906 {
1907 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
1908 	bool prev_on_scx = prev->sched_class == &ext_sched_class;
1909 	bool has_tasks = false;
1910 
1911 	lockdep_assert_rq_held(rq);
1912 	rq->scx.flags |= SCX_RQ_BALANCING;
1913 
1914 	if (prev_on_scx) {
1915 		WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
1916 		update_curr_scx(rq);
1917 
1918 		/*
1919 		 * If @prev is runnable & has slice left, it has priority and
1920 		 * fetching more just increases latency for the fetched tasks.
1921 		 * Tell put_prev_task_scx() to put @prev on local_dsq.
1922 		 *
1923 		 * See scx_ops_disable_workfn() for the explanation on the
1924 		 * bypassing test.
1925 		 */
1926 		if ((prev->scx.flags & SCX_TASK_QUEUED) &&
1927 		    prev->scx.slice && !scx_ops_bypassing()) {
1928 			prev->scx.flags |= SCX_TASK_BAL_KEEP;
1929 			goto has_tasks;
1930 		}
1931 	}
1932 
1933 	/* if there already are tasks to run, nothing to do */
1934 	if (rq->scx.local_dsq.nr)
1935 		goto has_tasks;
1936 
1937 	if (consume_dispatch_q(rq, rf, &scx_dsq_global))
1938 		goto has_tasks;
1939 
1940 	if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq))
1941 		goto out;
1942 
1943 	dspc->rq = rq;
1944 	dspc->rf = rf;
1945 
1946 	/*
1947 	 * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock,
1948 	 * the local DSQ might still end up empty after a successful
1949 	 * ops.dispatch(). If the local DSQ is empty even after ops.dispatch()
1950 	 * produced some tasks, retry. The BPF scheduler may depend on this
1951 	 * looping behavior to simplify its implementation.
1952 	 */
1953 	do {
1954 		dspc->nr_tasks = 0;
1955 
1956 		SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq),
1957 			    prev_on_scx ? prev : NULL);
1958 
1959 		flush_dispatch_buf(rq, rf);
1960 
1961 		if (rq->scx.local_dsq.nr)
1962 			goto has_tasks;
1963 		if (consume_dispatch_q(rq, rf, &scx_dsq_global))
1964 			goto has_tasks;
1965 	} while (dspc->nr_tasks);
1966 
1967 	goto out;
1968 
1969 has_tasks:
1970 	has_tasks = true;
1971 out:
1972 	rq->scx.flags &= ~SCX_RQ_BALANCING;
1973 	return has_tasks;
1974 }
1975 
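/*
 * The loop above is what drives a BPF scheduler's ops.dispatch(). A minimal
 * implementation only needs to refill the local DSQ. A sketch, assuming the
 * scx_bpf_consume() kfunc, the BPF_STRUCT_OPS() macro from the example
 * schedulers and a scheduler-defined SHARED_DSQ id (all illustrative here):
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(SHARED_DSQ);
 *	}
 *
 * If the task pulled onto the local DSQ is stolen or dequeued before this CPU
 * runs it, the still-empty local DSQ makes the loop call ops.dispatch() again
 * as long as the previous round produced tasks.
 */
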
1976 static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
1977 {
1978 	if (p->scx.flags & SCX_TASK_QUEUED) {
1979 		WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE);
1980 		dispatch_dequeue(rq, p);
1981 	}
1982 
1983 	p->se.exec_start = rq_clock_task(rq);
1984 
1985 	clr_task_runnable(p, true);
1986 }
1987 
1988 static void put_prev_task_scx(struct rq *rq, struct task_struct *p)
1989 {
1990 #ifndef CONFIG_SMP
1991 	/*
1992 	 * UP workaround.
1993 	 *
1994 	 * Because SCX may transfer tasks across CPUs during dispatch, dispatch
1995 	 * is performed from its balance operation which isn't called in UP.
1996 	 * Let's work around by calling it from the operations which come right
1997 	 * after.
1998 	 *
1999 	 * 1. If the prev task is on SCX, pick_next_task() calls
2000 	 *    .put_prev_task() right after. As .put_prev_task() is also called
2001 	 *    from other places, we need to distinguish the calls which can be
2002 	 *    done by looking at the previous task's state - if still queued or
2003 	 *    dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task().
2004 	 *    This case is handled here.
2005 	 *
2006 	 * 2. If the prev task is not on SCX, the first following call into SCX
2007 	 *    will be .pick_next_task(), which is covered by calling
2008 	 *    balance_scx() from pick_next_task_scx().
2009 	 *
2010 	 * Note that we can't merge the first case into the second as
2011 	 * balance_scx() must be called before the previous SCX task goes
2012 	 * through put_prev_task_scx().
2013 	 *
2014 	 * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf.
2015 	 * Pass in %NULL.
2016 	 */
2017 	if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP))
2018 		balance_scx(rq, p, NULL);
2019 #endif
2020 
2021 	update_curr_scx(rq);
2022 
2023 	/*
2024 	 * If we're being called from put_prev_task_balance(), balance_scx() may
2025 	 * have decided that @p should keep running.
2026 	 */
2027 	if (p->scx.flags & SCX_TASK_BAL_KEEP) {
2028 		p->scx.flags &= ~SCX_TASK_BAL_KEEP;
2029 		set_task_runnable(rq, p);
2030 		dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
2031 		return;
2032 	}
2033 
2034 	if (p->scx.flags & SCX_TASK_QUEUED) {
2035 		set_task_runnable(rq, p);
2036 
2037 		/*
2038 		 * If @p has slice left and balance_scx() didn't tag it for
2039 		 * keeping, @p is getting preempted by a higher priority
2040 		 * scheduler class. Leave it at the head of the local DSQ.
2041 		 */
2042 		if (p->scx.slice && !scx_ops_bypassing()) {
2043 			dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD);
2044 			return;
2045 		}
2046 
2047 		/*
2048 		 * If we're in the pick_next_task path, balance_scx() should
2049 		 * have already populated the local DSQ if there are any other
2050 		 * available tasks. If empty, tell ops.enqueue() that @p is the
2051 		 * only one available for this cpu. ops.enqueue() should put it
2052 		 * on the local DSQ so that the subsequent pick_next_task_scx()
2053 		 * can find the task unless it wants to trigger a separate
2054 		 * follow-up scheduling event.
2055 		 */
2056 		if (list_empty(&rq->scx.local_dsq.list))
2057 			do_enqueue_task(rq, p, SCX_ENQ_LAST, -1);
2058 		else
2059 			do_enqueue_task(rq, p, 0, -1);
2060 	}
2061 }
2062 
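/*
 * Schedulers that opt into receiving these last-task enqueues (the
 * SCX_OPS_ENQ_LAST ops flag, see validate_ops() below) can keep the CPU busy
 * by putting @p straight back on the local DSQ. A sketch (SHARED_DSQ and the
 * BPF_STRUCT_OPS() macro from the example schedulers are illustrative):
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		if (enq_flags & SCX_ENQ_LAST) {
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL,
 *					 enq_flags);
 *			return;
 *		}
 *		scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags);
 *	}
 */
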
2063 static struct task_struct *first_local_task(struct rq *rq)
2064 {
2065 	return list_first_entry_or_null(&rq->scx.local_dsq.list,
2066 					struct task_struct, scx.dsq_node);
2067 }
2068 
2069 static struct task_struct *pick_next_task_scx(struct rq *rq)
2070 {
2071 	struct task_struct *p;
2072 
2073 #ifndef CONFIG_SMP
2074 	/* UP workaround - see the comment at the head of put_prev_task_scx() */
2075 	if (unlikely(rq->curr->sched_class != &ext_sched_class))
2076 		balance_scx(rq, rq->curr, NULL);
2077 #endif
2078 
2079 	p = first_local_task(rq);
2080 	if (!p)
2081 		return NULL;
2082 
2083 	set_next_task_scx(rq, p, true);
2084 
2085 	if (unlikely(!p->scx.slice)) {
2086 		if (!scx_ops_bypassing() && !scx_warned_zero_slice) {
2087 			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
2088 					p->comm, p->pid);
2089 			scx_warned_zero_slice = true;
2090 		}
2091 		p->scx.slice = SCX_SLICE_DFL;
2092 	}
2093 
2094 	return p;
2095 }
2096 
2097 #ifdef CONFIG_SMP
2098 
2099 static bool test_and_clear_cpu_idle(int cpu)
2100 {
2101 #ifdef CONFIG_SCHED_SMT
2102 	/*
2103 	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
2104 	 * cluster is not wholly idle either way. This also prevents
2105 	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
2106 	 */
2107 	if (sched_smt_active()) {
2108 		const struct cpumask *smt = cpu_smt_mask(cpu);
2109 
2110 		/*
2111 		 * If offline, @cpu is not its own sibling and
2112 		 * scx_pick_idle_cpu() can get caught in an infinite loop as
2113 		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
2114 		 * is eventually cleared.
2115 		 */
2116 		if (cpumask_intersects(smt, idle_masks.smt))
2117 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
2118 		else if (cpumask_test_cpu(cpu, idle_masks.smt))
2119 			__cpumask_clear_cpu(cpu, idle_masks.smt);
2120 	}
2121 #endif
2122 	return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu);
2123 }
2124 
2125 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags)
2126 {
2127 	int cpu;
2128 
2129 retry:
2130 	if (sched_smt_active()) {
2131 		cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed);
2132 		if (cpu < nr_cpu_ids)
2133 			goto found;
2134 
2135 		if (flags & SCX_PICK_IDLE_CORE)
2136 			return -EBUSY;
2137 	}
2138 
2139 	cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed);
2140 	if (cpu >= nr_cpu_ids)
2141 		return -EBUSY;
2142 
2143 found:
2144 	if (test_and_clear_cpu_idle(cpu))
2145 		return cpu;
2146 	else
2147 		goto retry;
2148 }
2149 
2150 static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
2151 			      u64 wake_flags, bool *found)
2152 {
2153 	s32 cpu;
2154 
2155 	*found = false;
2156 
2157 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
2158 		scx_ops_error("built-in idle tracking is disabled");
2159 		return prev_cpu;
2160 	}
2161 
2162 	/*
2163 	 * If WAKE_SYNC, the waker's local DSQ is empty, and the system is
2164 	 * under utilized, wake up @p to the local DSQ of the waker. Checking
2165 	 * only for an empty local DSQ is insufficient as it could give the
2166 	 * wakee an unfair advantage when the system is oversaturated.
2167 	 * Checking only for the presence of idle CPUs is also insufficient as
2168 	 * the local DSQ of the waker could have tasks piled up on it even if
2169 	 * there is an idle core elsewhere on the system.
2170 	 */
2171 	cpu = smp_processor_id();
2172 	if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 &&
2173 	    !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) &&
2174 	    cpu_rq(cpu)->scx.local_dsq.nr == 0) {
2175 		if (cpumask_test_cpu(cpu, p->cpus_ptr))
2176 			goto cpu_found;
2177 	}
2178 
2179 	if (p->nr_cpus_allowed == 1) {
2180 		if (test_and_clear_cpu_idle(prev_cpu)) {
2181 			cpu = prev_cpu;
2182 			goto cpu_found;
2183 		} else {
2184 			return prev_cpu;
2185 		}
2186 	}
2187 
2188 	/*
2189 	 * If CPU has SMT, any wholly idle CPU is likely a better pick than
2190 	 * partially idle @prev_cpu.
2191 	 */
2192 	if (sched_smt_active()) {
2193 		if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
2194 		    test_and_clear_cpu_idle(prev_cpu)) {
2195 			cpu = prev_cpu;
2196 			goto cpu_found;
2197 		}
2198 
2199 		cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);
2200 		if (cpu >= 0)
2201 			goto cpu_found;
2202 	}
2203 
2204 	if (test_and_clear_cpu_idle(prev_cpu)) {
2205 		cpu = prev_cpu;
2206 		goto cpu_found;
2207 	}
2208 
2209 	cpu = scx_pick_idle_cpu(p->cpus_ptr, 0);
2210 	if (cpu >= 0)
2211 		goto cpu_found;
2212 
2213 	return prev_cpu;
2214 
2215 cpu_found:
2216 	*found = true;
2217 	return cpu;
2218 }
2219 
2220 static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags)
2221 {
2222 	/*
2223 	 * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it
2224 	 * can be a good migration opportunity with low cache and memory
2225 	 * footprint. Returning a CPU different than @prev_cpu triggers
2226 	 * immediate rq migration. However, for SCX, as the current rq
2227 	 * association doesn't dictate where the task is going to run, this
2228 	 * doesn't fit well. If necessary, we can later add a dedicated method
2229 	 * which can decide to preempt self to force it through the regular
2230 	 * scheduling path.
2231 	 */
2232 	if (unlikely(wake_flags & WF_EXEC))
2233 		return prev_cpu;
2234 
2235 	if (SCX_HAS_OP(select_cpu)) {
2236 		s32 cpu;
2237 		struct task_struct **ddsp_taskp;
2238 
2239 		ddsp_taskp = this_cpu_ptr(&direct_dispatch_task);
2240 		WARN_ON_ONCE(*ddsp_taskp);
2241 		*ddsp_taskp = p;
2242 
2243 		cpu = SCX_CALL_OP_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU,
2244 				      select_cpu, p, prev_cpu, wake_flags);
2245 		*ddsp_taskp = NULL;
2246 		if (ops_cpu_valid(cpu, "from ops.select_cpu()"))
2247 			return cpu;
2248 		else
2249 			return prev_cpu;
2250 	} else {
2251 		bool found;
2252 		s32 cpu;
2253 
2254 		cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found);
2255 		if (found) {
2256 			p->scx.slice = SCX_SLICE_DFL;
2257 			p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL;
2258 		}
2259 		return cpu;
2260 	}
2261 }
2262 
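/*
 * A BPF scheduler implementing ops.select_cpu() can reuse the default idle CPU
 * selection and dispatch directly from this context. A sketch, assuming the
 * scx_bpf_select_cpu_dfl() kfunc exported for this purpose (names are
 * illustrative):
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu;
 *
 *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *		if (is_idle)
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}
 *
 * The direct_dispatch_task dance above is what allows scx_bpf_dispatch() to be
 * called here - the verdict is recorded against @p and applied when the task
 * is subsequently enqueued.
 */
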
2263 static void set_cpus_allowed_scx(struct task_struct *p,
2264 				 struct affinity_context *ac)
2265 {
2266 	set_cpus_allowed_common(p, ac);
2267 
2268 	/*
2269 	 * The effective cpumask is stored in @p->cpus_ptr which may temporarily
2270 	 * differ from the configured one in @p->cpus_mask. Always tell the BPF
2271 	 * scheduler the effective one.
2272 	 *
2273 	 * Fine-grained memory write control is enforced by BPF, making the const
2274 	 * designation pointless. Cast it away when calling the operation.
2275 	 */
2276 	if (SCX_HAS_OP(set_cpumask))
2277 		SCX_CALL_OP(SCX_KF_REST, set_cpumask, p,
2278 			    (struct cpumask *)p->cpus_ptr);
2279 }
2280 
2281 static void reset_idle_masks(void)
2282 {
2283 	/*
2284 	 * Consider all online cpus idle. Should converge to the actual state
2285 	 * quickly.
2286 	 */
2287 	cpumask_copy(idle_masks.cpu, cpu_online_mask);
2288 	cpumask_copy(idle_masks.smt, cpu_online_mask);
2289 }
2290 
2291 void __scx_update_idle(struct rq *rq, bool idle)
2292 {
2293 	int cpu = cpu_of(rq);
2294 
2295 	if (SCX_HAS_OP(update_idle)) {
2296 		SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle);
2297 		if (!static_branch_unlikely(&scx_builtin_idle_enabled))
2298 			return;
2299 	}
2300 
2301 	if (idle)
2302 		cpumask_set_cpu(cpu, idle_masks.cpu);
2303 	else
2304 		cpumask_clear_cpu(cpu, idle_masks.cpu);
2305 
2306 #ifdef CONFIG_SCHED_SMT
2307 	if (sched_smt_active()) {
2308 		const struct cpumask *smt = cpu_smt_mask(cpu);
2309 
2310 		if (idle) {
2311 			/*
2312 			 * idle_masks.smt handling is racy but that's fine as
2313 			 * it's only for optimization and self-correcting.
2314 			 */
2315 			for_each_cpu(cpu, smt) {
2316 				if (!cpumask_test_cpu(cpu, idle_masks.cpu))
2317 					return;
2318 			}
2319 			cpumask_or(idle_masks.smt, idle_masks.smt, smt);
2320 		} else {
2321 			cpumask_andnot(idle_masks.smt, idle_masks.smt, smt);
2322 		}
2323 	}
2324 #endif
2325 }
2326 
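/*
 * When ops.update_idle() is implemented and the built-in tracking is disabled
 * (scx_builtin_idle_enabled above), idle CPU accounting is entirely up to the
 * BPF scheduler. A sketch which simply mirrors the transitions into a BPF map
 * for the scheduler's own bookkeeping (idle_state_map is illustrative):
 *
 *	void BPF_STRUCT_OPS(example_update_idle, s32 cpu, bool idle)
 *	{
 *		u32 key = cpu;
 *		u8 val = idle;
 *
 *		bpf_map_update_elem(&idle_state_map, &key, &val, BPF_ANY);
 *	}
 */
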
2327 #else	/* CONFIG_SMP */
2328 
2329 static bool test_and_clear_cpu_idle(int cpu) { return false; }
2330 static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; }
2331 static void reset_idle_masks(void) {}
2332 
2333 #endif	/* CONFIG_SMP */
2334 
2335 static bool check_rq_for_timeouts(struct rq *rq)
2336 {
2337 	struct task_struct *p;
2338 	struct rq_flags rf;
2339 	bool timed_out = false;
2340 
2341 	rq_lock_irqsave(rq, &rf);
2342 	list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) {
2343 		unsigned long last_runnable = p->scx.runnable_at;
2344 
2345 		if (unlikely(time_after(jiffies,
2346 					last_runnable + scx_watchdog_timeout))) {
2347 			u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable);
2348 
2349 			scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
2350 					   "%s[%d] failed to run for %u.%03us",
2351 					   p->comm, p->pid,
2352 					   dur_ms / 1000, dur_ms % 1000);
2353 			timed_out = true;
2354 			break;
2355 		}
2356 	}
2357 	rq_unlock_irqrestore(rq, &rf);
2358 
2359 	return timed_out;
2360 }
2361 
2362 static void scx_watchdog_workfn(struct work_struct *work)
2363 {
2364 	int cpu;
2365 
2366 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
2367 
2368 	for_each_online_cpu(cpu) {
2369 		if (unlikely(check_rq_for_timeouts(cpu_rq(cpu))))
2370 			break;
2371 
2372 		cond_resched();
2373 	}
2374 	queue_delayed_work(system_unbound_wq, to_delayed_work(work),
2375 			   scx_watchdog_timeout / 2);
2376 }
2377 
2378 void scx_tick(struct rq *rq)
2379 {
2380 	unsigned long last_check;
2381 
2382 	if (!scx_enabled())
2383 		return;
2384 
2385 	last_check = READ_ONCE(scx_watchdog_timestamp);
2386 	if (unlikely(time_after(jiffies,
2387 				last_check + READ_ONCE(scx_watchdog_timeout)))) {
2388 		u32 dur_ms = jiffies_to_msecs(jiffies - last_check);
2389 
2390 		scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
2391 				   "watchdog failed to check in for %u.%03us",
2392 				   dur_ms / 1000, dur_ms % 1000);
2393 	}
2394 
2395 	update_other_load_avgs(rq);
2396 }
2397 
2398 static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
2399 {
2400 	update_curr_scx(rq);
2401 
2402 	/*
2403 	 * While bypassing, always resched as we can't trust the slice
2404 	 * management.
2405 	 */
2406 	if (scx_ops_bypassing())
2407 		curr->scx.slice = 0;
2408 	else if (SCX_HAS_OP(tick))
2409 		SCX_CALL_OP(SCX_KF_REST, tick, curr);
2410 
2411 	if (!curr->scx.slice)
2412 		resched_curr(rq);
2413 }
2414 
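/*
 * Because the slice is re-checked on every tick above, ops.tick() is a
 * convenient hook for the BPF scheduler to force preemption by zeroing the
 * remaining slice. A sketch, assuming the scx_bpf_dsq_nr_queued() kfunc and an
 * illustrative HIGHPRI_DSQ:
 *
 *	void BPF_STRUCT_OPS(example_tick, struct task_struct *p)
 *	{
 *		if (scx_bpf_dsq_nr_queued(HIGHPRI_DSQ))
 *			p->scx.slice = 0;
 *	}
 *
 * With the slice zeroed, the resched_curr() above triggers on the same tick
 * and @p goes back through the scheduling path.
 */
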
2415 static enum scx_task_state scx_get_task_state(const struct task_struct *p)
2416 {
2417 	return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT;
2418 }
2419 
2420 static void scx_set_task_state(struct task_struct *p, enum scx_task_state state)
2421 {
2422 	enum scx_task_state prev_state = scx_get_task_state(p);
2423 	bool warn = false;
2424 
2425 	BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS));
2426 
2427 	switch (state) {
2428 	case SCX_TASK_NONE:
2429 		break;
2430 	case SCX_TASK_INIT:
2431 		warn = prev_state != SCX_TASK_NONE;
2432 		break;
2433 	case SCX_TASK_READY:
2434 		warn = prev_state == SCX_TASK_NONE;
2435 		break;
2436 	case SCX_TASK_ENABLED:
2437 		warn = prev_state != SCX_TASK_READY;
2438 		break;
2439 	default:
2440 		warn = true;
2441 		return;
2442 	}
2443 
2444 	WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]",
2445 		  prev_state, state, p->comm, p->pid);
2446 
2447 	p->scx.flags &= ~SCX_TASK_STATE_MASK;
2448 	p->scx.flags |= state << SCX_TASK_STATE_SHIFT;
2449 }
2450 
2451 static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork)
2452 {
2453 	int ret;
2454 
2455 	p->scx.disallow = false;
2456 
2457 	if (SCX_HAS_OP(init_task)) {
2458 		struct scx_init_task_args args = {
2459 			.fork = fork,
2460 		};
2461 
2462 		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args);
2463 		if (unlikely(ret)) {
2464 			ret = ops_sanitize_err("init_task", ret);
2465 			return ret;
2466 		}
2467 	}
2468 
2469 	scx_set_task_state(p, SCX_TASK_INIT);
2470 
2471 	if (p->scx.disallow) {
2472 		struct rq *rq;
2473 		struct rq_flags rf;
2474 
2475 		rq = task_rq_lock(p, &rf);
2476 
2477 		/*
2478 		 * We're either in fork or load path and @p->policy will be
2479 		 * applied right after. Reverting @p->policy here and rejecting
2480 		 * %SCHED_EXT transitions from scx_check_setscheduler()
2481 		 * guarantees that if ops.init_task() sets @p->disallow, @p can
2482 		 * never be in SCX.
2483 		 */
2484 		if (p->policy == SCHED_EXT) {
2485 			p->policy = SCHED_NORMAL;
2486 			atomic_long_inc(&scx_nr_rejected);
2487 		}
2488 
2489 		task_rq_unlock(rq, p, &rf);
2490 	}
2491 
2492 	p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT;
2493 	return 0;
2494 }
2495 
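/*
 * The disallow handling above is driven from the BPF side - an ops.init_task()
 * implementation only has to flip the flag before returning. A sketch using
 * the BPF_STRUCT_OPS_SLEEPABLE() macro from the example schedulers (the
 * pinned-task policy is just an example):
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init_task, struct task_struct *p,
 *				     struct scx_init_task_args *args)
 *	{
 *		if (p->nr_cpus_allowed == 1)
 *			p->scx.disallow = true;
 *		return 0;
 *	}
 *
 * A negative return is treated as a failure by the caller, whether that's the
 * fork path or the scheduler load path.
 */
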
2496 static void set_task_scx_weight(struct task_struct *p)
2497 {
2498 	u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO];
2499 
2500 	p->scx.weight = sched_weight_to_cgroup(weight);
2501 }
2502 
2503 static void scx_ops_enable_task(struct task_struct *p)
2504 {
2505 	lockdep_assert_rq_held(task_rq(p));
2506 
2507 	/*
2508 	 * Set the weight before calling ops.enable() so that the scheduler
2509 	 * doesn't see a stale value if they inspect the task struct.
2510 	 */
2511 	set_task_scx_weight(p);
2512 	if (SCX_HAS_OP(enable))
2513 		SCX_CALL_OP(SCX_KF_REST, enable, p);
2514 	scx_set_task_state(p, SCX_TASK_ENABLED);
2515 
2516 	if (SCX_HAS_OP(set_weight))
2517 		SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight);
2518 }
2519 
2520 static void scx_ops_disable_task(struct task_struct *p)
2521 {
2522 	lockdep_assert_rq_held(task_rq(p));
2523 	WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED);
2524 
2525 	if (SCX_HAS_OP(disable))
2526 		SCX_CALL_OP(SCX_KF_REST, disable, p);
2527 	scx_set_task_state(p, SCX_TASK_READY);
2528 }
2529 
2530 static void scx_ops_exit_task(struct task_struct *p)
2531 {
2532 	struct scx_exit_task_args args = {
2533 		.cancelled = false,
2534 	};
2535 
2536 	lockdep_assert_rq_held(task_rq(p));
2537 
2538 	switch (scx_get_task_state(p)) {
2539 	case SCX_TASK_NONE:
2540 		return;
2541 	case SCX_TASK_INIT:
2542 		args.cancelled = true;
2543 		break;
2544 	case SCX_TASK_READY:
2545 		break;
2546 	case SCX_TASK_ENABLED:
2547 		scx_ops_disable_task(p);
2548 		break;
2549 	default:
2550 		WARN_ON_ONCE(true);
2551 		return;
2552 	}
2553 
2554 	if (SCX_HAS_OP(exit_task))
2555 		SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args);
2556 	scx_set_task_state(p, SCX_TASK_NONE);
2557 }
2558 
2559 void init_scx_entity(struct sched_ext_entity *scx)
2560 {
2561 	/*
2562 	 * init_idle() calls this function again after the fork sequence is
2563 	 * complete. Don't touch ->tasks_node as it's already linked.
2564 	 */
2565 	memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node));
2566 
2567 	INIT_LIST_HEAD(&scx->dsq_node);
2568 	scx->sticky_cpu = -1;
2569 	scx->holding_cpu = -1;
2570 	INIT_LIST_HEAD(&scx->runnable_node);
2571 	scx->runnable_at = jiffies;
2572 	scx->ddsp_dsq_id = SCX_DSQ_INVALID;
2573 	scx->slice = SCX_SLICE_DFL;
2574 }
2575 
2576 void scx_pre_fork(struct task_struct *p)
2577 {
2578 	/*
2579 	 * BPF scheduler enable/disable paths want to be able to iterate and
2580 	 * update all tasks which can become complex when racing forks. As
2581 	 * enable/disable are very cold paths, let's use a percpu_rwsem to
2582 	 * exclude forks.
2583 	 */
2584 	percpu_down_read(&scx_fork_rwsem);
2585 }
2586 
2587 int scx_fork(struct task_struct *p)
2588 {
2589 	percpu_rwsem_assert_held(&scx_fork_rwsem);
2590 
2591 	if (scx_enabled())
2592 		return scx_ops_init_task(p, task_group(p), true);
2593 	else
2594 		return 0;
2595 }
2596 
2597 void scx_post_fork(struct task_struct *p)
2598 {
2599 	if (scx_enabled()) {
2600 		scx_set_task_state(p, SCX_TASK_READY);
2601 
2602 		/*
2603 		 * Enable the task immediately if it's running on sched_ext.
2604 		 * Otherwise, it'll be enabled in switching_to_scx() if and
2605 		 * when it's ever configured to run with a SCHED_EXT policy.
2606 		 */
2607 		if (p->sched_class == &ext_sched_class) {
2608 			struct rq_flags rf;
2609 			struct rq *rq;
2610 
2611 			rq = task_rq_lock(p, &rf);
2612 			scx_ops_enable_task(p);
2613 			task_rq_unlock(rq, p, &rf);
2614 		}
2615 	}
2616 
2617 	spin_lock_irq(&scx_tasks_lock);
2618 	list_add_tail(&p->scx.tasks_node, &scx_tasks);
2619 	spin_unlock_irq(&scx_tasks_lock);
2620 
2621 	percpu_up_read(&scx_fork_rwsem);
2622 }
2623 
2624 void scx_cancel_fork(struct task_struct *p)
2625 {
2626 	if (scx_enabled()) {
2627 		struct rq *rq;
2628 		struct rq_flags rf;
2629 
2630 		rq = task_rq_lock(p, &rf);
2631 		WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY);
2632 		scx_ops_exit_task(p);
2633 		task_rq_unlock(rq, p, &rf);
2634 	}
2635 
2636 	percpu_up_read(&scx_fork_rwsem);
2637 }
2638 
2639 void sched_ext_free(struct task_struct *p)
2640 {
2641 	unsigned long flags;
2642 
2643 	spin_lock_irqsave(&scx_tasks_lock, flags);
2644 	list_del_init(&p->scx.tasks_node);
2645 	spin_unlock_irqrestore(&scx_tasks_lock, flags);
2646 
2647 	/*
2648 	 * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY ->
2649 	 * ENABLED transitions can't race us. Disable ops for @p.
2650 	 */
2651 	if (scx_get_task_state(p) != SCX_TASK_NONE) {
2652 		struct rq_flags rf;
2653 		struct rq *rq;
2654 
2655 		rq = task_rq_lock(p, &rf);
2656 		scx_ops_exit_task(p);
2657 		task_rq_unlock(rq, p, &rf);
2658 	}
2659 }
2660 
2661 static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio)
2662 {
2663 	lockdep_assert_rq_held(task_rq(p));
2664 
2665 	set_task_scx_weight(p);
2666 	if (SCX_HAS_OP(set_weight))
2667 		SCX_CALL_OP(SCX_KF_REST, set_weight, p, p->scx.weight);
2668 }
2669 
2670 static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio)
2671 {
2672 }
2673 
2674 static void switching_to_scx(struct rq *rq, struct task_struct *p)
2675 {
2676 	scx_ops_enable_task(p);
2677 
2678 	/*
2679 	 * set_cpus_allowed_scx() is not called while @p is associated with a
2680 	 * different scheduler class. Keep the BPF scheduler up-to-date.
2681 	 */
2682 	if (SCX_HAS_OP(set_cpumask))
2683 		SCX_CALL_OP(SCX_KF_REST, set_cpumask, p,
2684 			    (struct cpumask *)p->cpus_ptr);
2685 }
2686 
2687 static void switched_from_scx(struct rq *rq, struct task_struct *p)
2688 {
2689 	scx_ops_disable_task(p);
2690 }
2691 
2692 static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p, int wake_flags) {}
2693 static void switched_to_scx(struct rq *rq, struct task_struct *p) {}
2694 
2695 int scx_check_setscheduler(struct task_struct *p, int policy)
2696 {
2697 	lockdep_assert_rq_held(task_rq(p));
2698 
2699 	/* if disallow, reject transitioning into SCX */
2700 	if (scx_enabled() && READ_ONCE(p->scx.disallow) &&
2701 	    p->policy != policy && policy == SCHED_EXT)
2702 		return -EACCES;
2703 
2704 	return 0;
2705 }
2706 
2707 /*
2708  * Omitted operations:
2709  *
2710  * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task
2711  *   isn't tied to the CPU at that point. Preemption is implemented by resetting
2712  *   the victim task's slice to 0 and triggering reschedule on the target CPU.
2713  *
2714  * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
2715  *
2716  * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
2717  *   their current sched_class. Call them directly from sched core instead.
2718  *
2719  * - task_woken: Unnecessary.
2720  */
2721 DEFINE_SCHED_CLASS(ext) = {
2722 	.enqueue_task		= enqueue_task_scx,
2723 	.dequeue_task		= dequeue_task_scx,
2724 	.yield_task		= yield_task_scx,
2725 	.yield_to_task		= yield_to_task_scx,
2726 
2727 	.wakeup_preempt		= wakeup_preempt_scx,
2728 
2729 	.pick_next_task		= pick_next_task_scx,
2730 
2731 	.put_prev_task		= put_prev_task_scx,
2732 	.set_next_task		= set_next_task_scx,
2733 
2734 #ifdef CONFIG_SMP
2735 	.balance		= balance_scx,
2736 	.select_task_rq		= select_task_rq_scx,
2737 	.set_cpus_allowed	= set_cpus_allowed_scx,
2738 #endif
2739 
2740 	.task_tick		= task_tick_scx,
2741 
2742 	.switching_to		= switching_to_scx,
2743 	.switched_from		= switched_from_scx,
2744 	.switched_to		= switched_to_scx,
2745 	.reweight_task		= reweight_task_scx,
2746 	.prio_changed		= prio_changed_scx,
2747 
2748 	.update_curr		= update_curr_scx,
2749 
2750 #ifdef CONFIG_UCLAMP_TASK
2751 	.uclamp_enabled		= 0,
2752 #endif
2753 };
2754 
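/*
 * As the comment above the class definition notes, wakeup preemption is left
 * to the BPF scheduler. The usual pattern is to kick the target CPU with the
 * preempting kick flag (SCX_KICK_PREEMPT, assumed here), which zeroes the
 * remote current task's slice and forces it back through the scheduler:
 *
 *	scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT);
 *
 * where @cpu is whichever CPU the scheduler decided should give up its
 * current task.
 */
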
2755 static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id)
2756 {
2757 	memset(dsq, 0, sizeof(*dsq));
2758 
2759 	raw_spin_lock_init(&dsq->lock);
2760 	INIT_LIST_HEAD(&dsq->list);
2761 	dsq->id = dsq_id;
2762 }
2763 
2764 static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node)
2765 {
2766 	struct scx_dispatch_q *dsq;
2767 	int ret;
2768 
2769 	if (dsq_id & SCX_DSQ_FLAG_BUILTIN)
2770 		return ERR_PTR(-EINVAL);
2771 
2772 	dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node);
2773 	if (!dsq)
2774 		return ERR_PTR(-ENOMEM);
2775 
2776 	init_dsq(dsq, dsq_id);
2777 
2778 	ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node,
2779 				     dsq_hash_params);
2780 	if (ret) {
2781 		kfree(dsq);
2782 		return ERR_PTR(ret);
2783 	}
2784 	return dsq;
2785 }
2786 
2787 static void free_dsq_irq_workfn(struct irq_work *irq_work)
2788 {
2789 	struct llist_node *to_free = llist_del_all(&dsqs_to_free);
2790 	struct scx_dispatch_q *dsq, *tmp_dsq;
2791 
2792 	llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node)
2793 		kfree_rcu(dsq, rcu);
2794 }
2795 
2796 static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn);
2797 
2798 static void destroy_dsq(u64 dsq_id)
2799 {
2800 	struct scx_dispatch_q *dsq;
2801 	unsigned long flags;
2802 
2803 	rcu_read_lock();
2804 
2805 	dsq = find_user_dsq(dsq_id);
2806 	if (!dsq)
2807 		goto out_unlock_rcu;
2808 
2809 	raw_spin_lock_irqsave(&dsq->lock, flags);
2810 
2811 	if (dsq->nr) {
2812 		scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)",
2813 			      dsq->id, dsq->nr);
2814 		goto out_unlock_dsq;
2815 	}
2816 
2817 	if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params))
2818 		goto out_unlock_dsq;
2819 
2820 	/*
2821 	 * Mark dead by invalidating ->id to prevent dispatch_enqueue() from
2822 	 * queueing more tasks. As this function can be called from anywhere,
2823 	 * freeing is bounced through an irq work to avoid nesting RCU
2824 	 * operations inside scheduler locks.
2825 	 */
2826 	dsq->id = SCX_DSQ_INVALID;
2827 	llist_add(&dsq->free_node, &dsqs_to_free);
2828 	irq_work_queue(&free_dsq_irq_work);
2829 
2830 out_unlock_dsq:
2831 	raw_spin_unlock_irqrestore(&dsq->lock, flags);
2832 out_unlock_rcu:
2833 	rcu_read_unlock();
2834 }
2835 
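/*
 * User DSQs are created and torn down from the BPF side. A typical scheduler
 * allocates its shared queues in ops.init() and either destroys them
 * explicitly or relies on unload tearing them down (see
 * scx_ops_disable_workfn() below). A sketch, assuming the scx_bpf_create_dsq()
 * kfunc and an illustrative SHARED_DSQ id:
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
 *	{
 *		return scx_bpf_create_dsq(SHARED_DSQ, -1);
 *	}
 *
 * Tasks are then queued on it with scx_bpf_dispatch() and pulled to local DSQs
 * from ops.dispatch() as sketched earlier.
 */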
2836 
2837 /********************************************************************************
2838  * Sysfs interface and ops enable/disable.
2839  */
2840 
2841 #define SCX_ATTR(_name)								\
2842 	static struct kobj_attribute scx_attr_##_name = {			\
2843 		.attr = { .name = __stringify(_name), .mode = 0444 },		\
2844 		.show = scx_attr_##_name##_show,				\
2845 	}
2846 
2847 static ssize_t scx_attr_state_show(struct kobject *kobj,
2848 				   struct kobj_attribute *ka, char *buf)
2849 {
2850 	return sysfs_emit(buf, "%s\n",
2851 			  scx_ops_enable_state_str[scx_ops_enable_state()]);
2852 }
2853 SCX_ATTR(state);
2854 
2855 static ssize_t scx_attr_switch_all_show(struct kobject *kobj,
2856 					struct kobj_attribute *ka, char *buf)
2857 {
2858 	return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all));
2859 }
2860 SCX_ATTR(switch_all);
2861 
2862 static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj,
2863 					 struct kobj_attribute *ka, char *buf)
2864 {
2865 	return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected));
2866 }
2867 SCX_ATTR(nr_rejected);
2868 
2869 static struct attribute *scx_global_attrs[] = {
2870 	&scx_attr_state.attr,
2871 	&scx_attr_switch_all.attr,
2872 	&scx_attr_nr_rejected.attr,
2873 	NULL,
2874 };
2875 
2876 static const struct attribute_group scx_global_attr_group = {
2877 	.attrs = scx_global_attrs,
2878 };
2879 
2880 static void scx_kobj_release(struct kobject *kobj)
2881 {
2882 	kfree(kobj);
2883 }
2884 
2885 static ssize_t scx_attr_ops_show(struct kobject *kobj,
2886 				 struct kobj_attribute *ka, char *buf)
2887 {
2888 	return sysfs_emit(buf, "%s\n", scx_ops.name);
2889 }
2890 SCX_ATTR(ops);
2891 
2892 static struct attribute *scx_sched_attrs[] = {
2893 	&scx_attr_ops.attr,
2894 	NULL,
2895 };
2896 ATTRIBUTE_GROUPS(scx_sched);
2897 
2898 static const struct kobj_type scx_ktype = {
2899 	.release = scx_kobj_release,
2900 	.sysfs_ops = &kobj_sysfs_ops,
2901 	.default_groups = scx_sched_groups,
2902 };
2903 
2904 static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env)
2905 {
2906 	return add_uevent_var(env, "SCXOPS=%s", scx_ops.name);
2907 }
2908 
2909 static const struct kset_uevent_ops scx_uevent_ops = {
2910 	.uevent = scx_uevent,
2911 };
2912 
2913 /*
2914  * Used by sched_fork() and __setscheduler_prio() to pick the matching
2915  * sched_class. dl/rt are already handled.
2916  */
2917 bool task_should_scx(struct task_struct *p)
2918 {
2919 	if (!scx_enabled() ||
2920 	    unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING))
2921 		return false;
2922 	if (READ_ONCE(scx_switching_all))
2923 		return true;
2924 	return p->policy == SCHED_EXT;
2925 }
2926 
2927 /**
2928  * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
2929  *
2930  * Bypassing guarantees that all runnable tasks make forward progress without
2931  * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might
2932  * be held by tasks that the BPF scheduler is forgetting to run, which
2933  * unfortunately also excludes toggling the static branches.
2934  *
2935  * Let's work around by overriding a couple ops and modifying behaviors based on
2936  * the DISABLING state and then cycling the queued tasks through dequeue/enqueue
2937  * to force global FIFO scheduling.
2938  *
2939  * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order.
2940  *
2941  * b. ops.dispatch() is ignored.
2942  *
2943  * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be
2944  *    trusted. Whenever a tick triggers, the running task is rotated to the tail
2945  *    of the queue.
2946  *
2947  * d. pick_next_task() suppresses zero slice warning.
2948  *
2949  * e. scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM
2950  *    operations.
2951  */
2952 static void scx_ops_bypass(bool bypass)
2953 {
2954 	int depth, cpu;
2955 
2956 	if (bypass) {
2957 		depth = atomic_inc_return(&scx_ops_bypass_depth);
2958 		WARN_ON_ONCE(depth <= 0);
2959 		if (depth != 1)
2960 			return;
2961 	} else {
2962 		depth = atomic_dec_return(&scx_ops_bypass_depth);
2963 		WARN_ON_ONCE(depth < 0);
2964 		if (depth != 0)
2965 			return;
2966 	}
2967 
2968 	/*
2969 	 * We need to guarantee that no tasks are on the BPF scheduler while
2970 	 * bypassing. Either we see enabled or the enable path sees the
2971 	 * increased bypass_depth before moving tasks to SCX.
2972 	 */
2973 	if (!scx_enabled())
2974 		return;
2975 
2976 	/*
2977 	 * No task property is changing. We just need to make sure all currently
2978 	 * queued tasks are re-queued according to the new scx_ops_bypassing()
2979 	 * state. As an optimization, walk each rq's runnable_list instead of
2980 	 * the scx_tasks list.
2981 	 *
2982 	 * This function can't trust the scheduler and thus can't use
2983 	 * cpus_read_lock(). Walk all possible CPUs instead of online.
2984 	 */
2985 	for_each_possible_cpu(cpu) {
2986 		struct rq *rq = cpu_rq(cpu);
2987 		struct rq_flags rf;
2988 		struct task_struct *p, *n;
2989 
2990 		rq_lock_irqsave(rq, &rf);
2991 
2992 		/*
2993 		 * The use of list_for_each_entry_safe_reverse() is required
2994 		 * because each task is going to be removed from and added back
2995 		 * to the runnable_list during iteration. Because they're added
2996 		 * to the tail of the list, safe reverse iteration can still
2997 		 * visit all nodes.
2998 		 */
2999 		list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list,
3000 						 scx.runnable_node) {
3001 			struct sched_enq_and_set_ctx ctx;
3002 
3003 			/* cycling deq/enq is enough, see the function comment */
3004 			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
3005 			sched_enq_and_set_task(&ctx);
3006 		}
3007 
3008 		rq_unlock_irqrestore(rq, &rf);
3009 	}
3010 }
3011 
3012 static void free_exit_info(struct scx_exit_info *ei)
3013 {
3014 	kfree(ei->dump);
3015 	kfree(ei->msg);
3016 	kfree(ei->bt);
3017 	kfree(ei);
3018 }
3019 
3020 static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len)
3021 {
3022 	struct scx_exit_info *ei;
3023 
3024 	ei = kzalloc(sizeof(*ei), GFP_KERNEL);
3025 	if (!ei)
3026 		return NULL;
3027 
3028 	ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL);
3029 	ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL);
3030 	ei->dump = kzalloc(exit_dump_len, GFP_KERNEL);
3031 
3032 	if (!ei->bt || !ei->msg || !ei->dump) {
3033 		free_exit_info(ei);
3034 		return NULL;
3035 	}
3036 
3037 	return ei;
3038 }
3039 
3040 static const char *scx_exit_reason(enum scx_exit_kind kind)
3041 {
3042 	switch (kind) {
3043 	case SCX_EXIT_UNREG:
3044 		return "Scheduler unregistered from user space";
3045 	case SCX_EXIT_UNREG_BPF:
3046 		return "Scheduler unregistered from BPF";
3047 	case SCX_EXIT_UNREG_KERN:
3048 		return "Scheduler unregistered from the main kernel";
3049 	case SCX_EXIT_SYSRQ:
3050 		return "disabled by sysrq-S";
3051 	case SCX_EXIT_ERROR:
3052 		return "runtime error";
3053 	case SCX_EXIT_ERROR_BPF:
3054 		return "scx_bpf_error";
3055 	case SCX_EXIT_ERROR_STALL:
3056 		return "runnable task stall";
3057 	default:
3058 		return "<UNKNOWN>";
3059 	}
3060 }
3061 
3062 static void scx_ops_disable_workfn(struct kthread_work *work)
3063 {
3064 	struct scx_exit_info *ei = scx_exit_info;
3065 	struct scx_task_iter sti;
3066 	struct task_struct *p;
3067 	struct rhashtable_iter rht_iter;
3068 	struct scx_dispatch_q *dsq;
3069 	int i, kind;
3070 
3071 	kind = atomic_read(&scx_exit_kind);
3072 	while (true) {
3073 		/*
3074 		 * NONE indicates that a new scx_ops has been registered since
3075 		 * disable was scheduled - don't kill the new ops. DONE
3076 		 * indicates that the ops has already been disabled.
3077 		 */
3078 		if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)
3079 			return;
3080 		if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE))
3081 			break;
3082 	}
3083 	ei->kind = kind;
3084 	ei->reason = scx_exit_reason(ei->kind);
3085 
3086 	/* guarantee forward progress by bypassing scx_ops */
3087 	scx_ops_bypass(true);
3088 
3089 	switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) {
3090 	case SCX_OPS_DISABLING:
3091 		WARN_ONCE(true, "sched_ext: duplicate disabling instance?");
3092 		break;
3093 	case SCX_OPS_DISABLED:
3094 		pr_warn("sched_ext: ops error detected without ops (%s)\n",
3095 			scx_exit_info->msg);
3096 		WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
3097 			     SCX_OPS_DISABLING);
3098 		goto done;
3099 	default:
3100 		break;
3101 	}
3102 
3103 	/*
3104 	 * Here, every runnable task is guaranteed to make forward progress and
3105 	 * we can safely use blocking synchronization constructs. Actually
3106 	 * disable ops.
3107 	 */
3108 	mutex_lock(&scx_ops_enable_mutex);
3109 
3110 	static_branch_disable(&__scx_switched_all);
3111 	WRITE_ONCE(scx_switching_all, false);
3112 
3113 	/*
3114 	 * Avoid racing against fork. See scx_ops_enable() for explanation on
3115 	 * the locking order.
3116 	 */
3117 	percpu_down_write(&scx_fork_rwsem);
3118 	cpus_read_lock();
3119 
3120 	spin_lock_irq(&scx_tasks_lock);
3121 	scx_task_iter_init(&sti);
3122 	/*
3123 	 * Invoke scx_ops_exit_task() on all non-idle tasks, including
3124 	 * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount,
3125 	 * we may not have invoked sched_ext_free() on them by the time a
3126 	 * scheduler is disabled. We must therefore exit the task here, or we'd
3127 	 * fail to invoke ops.exit_task(), as the scheduler will have been
3128 	 * unloaded by the time the task is subsequently exited on the
3129 	 * sched_ext_free() path.
3130 	 */
3131 	while ((p = scx_task_iter_next_locked(&sti, true))) {
3132 		const struct sched_class *old_class = p->sched_class;
3133 		struct sched_enq_and_set_ctx ctx;
3134 
3135 		if (READ_ONCE(p->__state) != TASK_DEAD) {
3136 			sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
3137 					       &ctx);
3138 
3139 			p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
3140 			__setscheduler_prio(p, p->prio);
3141 			check_class_changing(task_rq(p), p, old_class);
3142 
3143 			sched_enq_and_set_task(&ctx);
3144 
3145 			check_class_changed(task_rq(p), p, old_class, p->prio);
3146 		}
3147 		scx_ops_exit_task(p);
3148 	}
3149 	scx_task_iter_exit(&sti);
3150 	spin_unlock_irq(&scx_tasks_lock);
3151 
3152 	/* no task is on scx, turn off all the switches and flush in-progress calls */
3153 	static_branch_disable_cpuslocked(&__scx_ops_enabled);
3154 	for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++)
3155 		static_branch_disable_cpuslocked(&scx_has_op[i]);
3156 	static_branch_disable_cpuslocked(&scx_ops_enq_last);
3157 	static_branch_disable_cpuslocked(&scx_ops_enq_exiting);
3158 	static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
3159 	synchronize_rcu();
3160 
3161 	cpus_read_unlock();
3162 	percpu_up_write(&scx_fork_rwsem);
3163 
3164 	if (ei->kind >= SCX_EXIT_ERROR) {
3165 		printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
3166 
3167 		if (ei->msg[0] == '\0')
3168 			printk(KERN_ERR "sched_ext: %s\n", ei->reason);
3169 		else
3170 			printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg);
3171 
3172 		stack_trace_print(ei->bt, ei->bt_len, 2);
3173 	}
3174 
3175 	if (scx_ops.exit)
3176 		SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei);
3177 
3178 	cancel_delayed_work_sync(&scx_watchdog_work);
3179 
3180 	/*
3181 	 * Delete the kobject from the hierarchy eagerly in addition to just
3182 	 * dropping a reference. Otherwise, if the object is deleted
3183 	 * asynchronously, sysfs could observe an object of the same name still
3184 	 * in the hierarchy when another scheduler is loaded.
3185 	 */
3186 	kobject_del(scx_root_kobj);
3187 	kobject_put(scx_root_kobj);
3188 	scx_root_kobj = NULL;
3189 
3190 	memset(&scx_ops, 0, sizeof(scx_ops));
3191 
3192 	rhashtable_walk_enter(&dsq_hash, &rht_iter);
3193 	do {
3194 		rhashtable_walk_start(&rht_iter);
3195 
3196 		while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq))
3197 			destroy_dsq(dsq->id);
3198 
3199 		rhashtable_walk_stop(&rht_iter);
3200 	} while (dsq == ERR_PTR(-EAGAIN));
3201 	rhashtable_walk_exit(&rht_iter);
3202 
3203 	free_percpu(scx_dsp_ctx);
3204 	scx_dsp_ctx = NULL;
3205 	scx_dsp_max_batch = 0;
3206 
3207 	free_exit_info(scx_exit_info);
3208 	scx_exit_info = NULL;
3209 
3210 	mutex_unlock(&scx_ops_enable_mutex);
3211 
3212 	WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) !=
3213 		     SCX_OPS_DISABLING);
3214 done:
3215 	scx_ops_bypass(false);
3216 }
3217 
3218 static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn);
3219 
3220 static void schedule_scx_ops_disable_work(void)
3221 {
3222 	struct kthread_worker *helper = READ_ONCE(scx_ops_helper);
3223 
3224 	/*
3225 	 * We may be called spuriously before the first bpf_sched_ext_reg(). If
3226 	 * scx_ops_helper isn't set up yet, there's nothing to do.
3227 	 */
3228 	if (helper)
3229 		kthread_queue_work(helper, &scx_ops_disable_work);
3230 }
3231 
3232 static void scx_ops_disable(enum scx_exit_kind kind)
3233 {
3234 	int none = SCX_EXIT_NONE;
3235 
3236 	if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE))
3237 		kind = SCX_EXIT_ERROR;
3238 
3239 	atomic_try_cmpxchg(&scx_exit_kind, &none, kind);
3240 
3241 	schedule_scx_ops_disable_work();
3242 }
3243 
3244 static void dump_newline(struct seq_buf *s)
3245 {
3246 	trace_sched_ext_dump("");
3247 
3248 	/* @s may be zero sized and seq_buf triggers WARN if so */
3249 	if (s->size)
3250 		seq_buf_putc(s, '\n');
3251 }
3252 
3253 static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...)
3254 {
3255 	va_list args;
3256 
3257 #ifdef CONFIG_TRACEPOINTS
3258 	if (trace_sched_ext_dump_enabled()) {
3259 		/* protected by scx_dump_state()::dump_lock */
3260 		static char line_buf[SCX_EXIT_MSG_LEN];
3261 
3262 		va_start(args, fmt);
3263 		vscnprintf(line_buf, sizeof(line_buf), fmt, args);
3264 		va_end(args);
3265 
3266 		trace_sched_ext_dump(line_buf);
3267 	}
3268 #endif
3269 	/* @s may be zero sized and seq_buf triggers WARN if so */
3270 	if (s->size) {
3271 		va_start(args, fmt);
3272 		seq_buf_vprintf(s, fmt, args);
3273 		va_end(args);
3274 
3275 		seq_buf_putc(s, '\n');
3276 	}
3277 }
3278 
3279 static void dump_stack_trace(struct seq_buf *s, const char *prefix,
3280 			     const unsigned long *bt, unsigned int len)
3281 {
3282 	unsigned int i;
3283 
3284 	for (i = 0; i < len; i++)
3285 		dump_line(s, "%s%pS", prefix, (void *)bt[i]);
3286 }
3287 
3288 static void ops_dump_init(struct seq_buf *s, const char *prefix)
3289 {
3290 	struct scx_dump_data *dd = &scx_dump_data;
3291 
3292 	lockdep_assert_irqs_disabled();
3293 
3294 	dd->cpu = smp_processor_id();		/* allow scx_bpf_dump() */
3295 	dd->first = true;
3296 	dd->cursor = 0;
3297 	dd->s = s;
3298 	dd->prefix = prefix;
3299 }
3300 
3301 static void ops_dump_flush(void)
3302 {
3303 	struct scx_dump_data *dd = &scx_dump_data;
3304 	char *line = dd->buf.line;
3305 
3306 	if (!dd->cursor)
3307 		return;
3308 
3309 	/*
3310 	 * There's something to flush and this is the first line. Insert a blank
3311 	 * line to distinguish the ops dump.
3312 	 */
3313 	if (dd->first) {
3314 		dump_newline(dd->s);
3315 		dd->first = false;
3316 	}
3317 
3318 	/*
3319 	 * There may be multiple lines in $line. Scan and emit each line
3320 	 * separately.
3321 	 */
3322 	while (true) {
3323 		char *end = line;
3324 		char c;
3325 
3326 		while (*end != '\n' && *end != '\0')
3327 			end++;
3328 
3329 		/*
3330 		 * If $line overflowed, it may not have a newline at the end.
3331 		 * Always emit with a newline.
3332 		 */
3333 		c = *end;
3334 		*end = '\0';
3335 		dump_line(dd->s, "%s%s", dd->prefix, line);
3336 		if (c == '\0')
3337 			break;
3338 
3339 		/* move to the next line */
3340 		end++;
3341 		if (*end == '\0')
3342 			break;
3343 		line = end;
3344 	}
3345 
3346 	dd->cursor = 0;
3347 }
3348 
3349 static void ops_dump_exit(void)
3350 {
3351 	ops_dump_flush();
3352 	scx_dump_data.cpu = -1;
3353 }
3354 
3355 static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx,
3356 			  struct task_struct *p, char marker)
3357 {
3358 	static unsigned long bt[SCX_EXIT_BT_LEN];
3359 	char dsq_id_buf[19] = "(n/a)";
3360 	unsigned long ops_state = atomic_long_read(&p->scx.ops_state);
3361 	unsigned int bt_len;
3362 
3363 	if (p->scx.dsq)
3364 		scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx",
3365 			  (unsigned long long)p->scx.dsq->id);
3366 
3367 	dump_newline(s);
3368 	dump_line(s, " %c%c %s[%d] %+ldms",
3369 		  marker, task_state_to_char(p), p->comm, p->pid,
3370 		  jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies));
3371 	dump_line(s, "      scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu",
3372 		  scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK,
3373 		  ops_state & SCX_OPSS_STATE_MASK,
3374 		  ops_state >> SCX_OPSS_QSEQ_SHIFT);
3375 	dump_line(s, "      sticky/holding_cpu=%d/%d dsq_id=%s",
3376 		  p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf);
3377 	dump_line(s, "      cpus=%*pb", cpumask_pr_args(p->cpus_ptr));
3378 
3379 	if (SCX_HAS_OP(dump_task)) {
3380 		ops_dump_init(s, "    ");
3381 		SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p);
3382 		ops_dump_exit();
3383 	}
3384 
3385 	bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1);
3386 	if (bt_len) {
3387 		dump_newline(s);
3388 		dump_stack_trace(s, "    ", bt, bt_len);
3389 	}
3390 }
3391 
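/*
 * The ops.dump_task() call above lets the BPF scheduler append its own
 * per-task details to the same dump through scx_bpf_dump(). A sketch, with the
 * task local storage map and struct task_ctx being illustrative:
 *
 *	void BPF_STRUCT_OPS(example_dump_task, struct scx_dump_ctx *dctx,
 *			    struct task_struct *p)
 *	{
 *		struct task_ctx *tctx;
 *
 *		tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0);
 *		if (tctx)
 *			scx_bpf_dump("enq_cnt=%llu deq_cnt=%llu",
 *				     tctx->enq_cnt, tctx->deq_cnt);
 *	}
 *
 * The output is accumulated and emitted with the indentation prefix set up by
 * ops_dump_init() above.
 */
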
3392 static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len)
3393 {
3394 	static DEFINE_SPINLOCK(dump_lock);
3395 	static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n";
3396 	struct scx_dump_ctx dctx = {
3397 		.kind = ei->kind,
3398 		.exit_code = ei->exit_code,
3399 		.reason = ei->reason,
3400 		.at_ns = ktime_get_ns(),
3401 		.at_jiffies = jiffies,
3402 	};
3403 	struct seq_buf s;
3404 	unsigned long flags;
3405 	char *buf;
3406 	int cpu;
3407 
3408 	spin_lock_irqsave(&dump_lock, flags);
3409 
3410 	seq_buf_init(&s, ei->dump, dump_len);
3411 
3412 	if (ei->kind == SCX_EXIT_NONE) {
3413 		dump_line(&s, "Debug dump triggered by %s", ei->reason);
3414 	} else {
3415 		dump_line(&s, "%s[%d] triggered exit kind %d:",
3416 			  current->comm, current->pid, ei->kind);
3417 		dump_line(&s, "  %s (%s)", ei->reason, ei->msg);
3418 		dump_newline(&s);
3419 		dump_line(&s, "Backtrace:");
3420 		dump_stack_trace(&s, "  ", ei->bt, ei->bt_len);
3421 	}
3422 
3423 	if (SCX_HAS_OP(dump)) {
3424 		ops_dump_init(&s, "");
3425 		SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx);
3426 		ops_dump_exit();
3427 	}
3428 
3429 	dump_newline(&s);
3430 	dump_line(&s, "CPU states");
3431 	dump_line(&s, "----------");
3432 
3433 	for_each_possible_cpu(cpu) {
3434 		struct rq *rq = cpu_rq(cpu);
3435 		struct rq_flags rf;
3436 		struct task_struct *p;
3437 		struct seq_buf ns;
3438 		size_t avail, used;
3439 		bool idle;
3440 
3441 		rq_lock(rq, &rf);
3442 
3443 		idle = list_empty(&rq->scx.runnable_list) &&
3444 			rq->curr->sched_class == &idle_sched_class;
3445 
3446 		if (idle && !SCX_HAS_OP(dump_cpu))
3447 			goto next;
3448 
3449 		/*
3450 		 * We don't yet know whether ops.dump_cpu() will produce output
3451 		 * and we may want to skip the default CPU dump if it doesn't.
3452 		 * Use a nested seq_buf to generate the standard dump so that we
3453 		 * can decide whether to commit later.
3454 		 */
3455 		avail = seq_buf_get_buf(&s, &buf);
3456 		seq_buf_init(&ns, buf, avail);
3457 
3458 		dump_newline(&ns);
3459 		dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x ops_qseq=%lu",
3460 			  cpu, rq->scx.nr_running, rq->scx.flags,
3461 			  rq->scx.ops_qseq);
3462 		dump_line(&ns, "          curr=%s[%d] class=%ps",
3463 			  rq->curr->comm, rq->curr->pid,
3464 			  rq->curr->sched_class);
3465 		if (!cpumask_empty(rq->scx.cpus_to_kick))
3466 			dump_line(&ns, "  cpus_to_kick   : %*pb",
3467 				  cpumask_pr_args(rq->scx.cpus_to_kick));
3468 		if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle))
3469 			dump_line(&ns, "  idle_to_kick   : %*pb",
3470 				  cpumask_pr_args(rq->scx.cpus_to_kick_if_idle));
3471 		if (!cpumask_empty(rq->scx.cpus_to_preempt))
3472 			dump_line(&ns, "  cpus_to_preempt: %*pb",
3473 				  cpumask_pr_args(rq->scx.cpus_to_preempt));
3474 
3475 		used = seq_buf_used(&ns);
3476 		if (SCX_HAS_OP(dump_cpu)) {
3477 			ops_dump_init(&ns, "  ");
3478 			SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle);
3479 			ops_dump_exit();
3480 		}
3481 
3482 		/*
3483 		 * If idle && nothing generated by ops.dump_cpu(), there's
3484 		 * nothing interesting. Skip.
3485 		 */
3486 		if (idle && used == seq_buf_used(&ns))
3487 			goto next;
3488 
3489 		/*
3490 		 * $s may already have overflowed when $ns was created. If so,
3491 		 * calling commit on it will trigger BUG.
3492 		 */
3493 		if (avail) {
3494 			seq_buf_commit(&s, seq_buf_used(&ns));
3495 			if (seq_buf_has_overflowed(&ns))
3496 				seq_buf_set_overflow(&s);
3497 		}
3498 
3499 		if (rq->curr->sched_class == &ext_sched_class)
3500 			scx_dump_task(&s, &dctx, rq->curr, '*');
3501 
3502 		list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node)
3503 			scx_dump_task(&s, &dctx, p, ' ');
3504 	next:
3505 		rq_unlock(rq, &rf);
3506 	}
3507 
3508 	if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker))
3509 		memcpy(ei->dump + dump_len - sizeof(trunc_marker),
3510 		       trunc_marker, sizeof(trunc_marker));
3511 
3512 	spin_unlock_irqrestore(&dump_lock, flags);
3513 }
3514 
3515 static void scx_ops_error_irq_workfn(struct irq_work *irq_work)
3516 {
3517 	struct scx_exit_info *ei = scx_exit_info;
3518 
3519 	if (ei->kind >= SCX_EXIT_ERROR)
3520 		scx_dump_state(ei, scx_ops.exit_dump_len);
3521 
3522 	schedule_scx_ops_disable_work();
3523 }
3524 
3525 static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn);
3526 
3527 static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind,
3528 					     s64 exit_code,
3529 					     const char *fmt, ...)
3530 {
3531 	struct scx_exit_info *ei = scx_exit_info;
3532 	int none = SCX_EXIT_NONE;
3533 	va_list args;
3534 
3535 	if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind))
3536 		return;
3537 
3538 	ei->exit_code = exit_code;
3539 
3540 	if (kind >= SCX_EXIT_ERROR)
3541 		ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1);
3542 
3543 	va_start(args, fmt);
3544 	vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args);
3545 	va_end(args);
3546 
3547 	/*
3548 	 * Set ei->kind and ->reason for scx_dump_state(). They'll be set again
3549 	 * in scx_ops_disable_workfn().
3550 	 */
3551 	ei->kind = kind;
3552 	ei->reason = scx_exit_reason(ei->kind);
3553 
3554 	irq_work_queue(&scx_ops_error_irq_work);
3555 }
3556 
3557 static struct kthread_worker *scx_create_rt_helper(const char *name)
3558 {
3559 	struct kthread_worker *helper;
3560 
3561 	helper = kthread_create_worker(0, name);
3562 	if (helper)
3563 		sched_set_fifo(helper->task);
3564 	return helper;
3565 }
3566 
3567 static int validate_ops(const struct sched_ext_ops *ops)
3568 {
3569 	/*
3570 	 * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the
3571 	 * ops.enqueue() callback isn't implemented.
3572 	 */
3573 	if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) {
3574 		scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented");
3575 		return -EINVAL;
3576 	}
3577 
3578 	return 0;
3579 }
3580 
3581 static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
3582 {
3583 	struct scx_task_iter sti;
3584 	struct task_struct *p;
3585 	unsigned long timeout;
3586 	int i, ret;
3587 
3588 	mutex_lock(&scx_ops_enable_mutex);
3589 
3590 	if (!scx_ops_helper) {
3591 		WRITE_ONCE(scx_ops_helper,
3592 			   scx_create_rt_helper("sched_ext_ops_helper"));
3593 		if (!scx_ops_helper) {
3594 			ret = -ENOMEM;
3595 			goto err_unlock;
3596 		}
3597 	}
3598 
3599 	if (scx_ops_enable_state() != SCX_OPS_DISABLED) {
3600 		ret = -EBUSY;
3601 		goto err_unlock;
3602 	}
3603 
3604 	scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL);
3605 	if (!scx_root_kobj) {
3606 		ret = -ENOMEM;
3607 		goto err_unlock;
3608 	}
3609 
3610 	scx_root_kobj->kset = scx_kset;
3611 	ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root");
3612 	if (ret < 0)
3613 		goto err;
3614 
3615 	scx_exit_info = alloc_exit_info(ops->exit_dump_len);
3616 	if (!scx_exit_info) {
3617 		ret = -ENOMEM;
3618 		goto err_del;
3619 	}
3620 
3621 	/*
3622 	 * Set scx_ops, transition to PREPPING and clear exit info to arm the
3623 	 * disable path. Failure triggers full disabling from here on.
3624 	 */
3625 	scx_ops = *ops;
3626 
3627 	WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) !=
3628 		     SCX_OPS_DISABLED);
3629 
3630 	atomic_set(&scx_exit_kind, SCX_EXIT_NONE);
3631 	scx_warned_zero_slice = false;
3632 
3633 	atomic_long_set(&scx_nr_rejected, 0);
3634 
3635 	/*
3636 	 * Keep CPUs stable during enable so that the BPF scheduler can track
3637 	 * online CPUs by watching ->on/offline_cpu() after ->init().
3638 	 */
3639 	cpus_read_lock();
3640 
3641 	if (scx_ops.init) {
3642 		ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init);
3643 		if (ret) {
3644 			ret = ops_sanitize_err("init", ret);
3645 			goto err_disable_unlock_cpus;
3646 		}
3647 	}
3648 
3649 	cpus_read_unlock();
3650 
3651 	ret = validate_ops(ops);
3652 	if (ret)
3653 		goto err_disable;
3654 
3655 	WARN_ON_ONCE(scx_dsp_ctx);
3656 	scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH;
3657 	scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf,
3658 						   scx_dsp_max_batch),
3659 				     __alignof__(struct scx_dsp_ctx));
3660 	if (!scx_dsp_ctx) {
3661 		ret = -ENOMEM;
3662 		goto err_disable;
3663 	}
3664 
3665 	if (ops->timeout_ms)
3666 		timeout = msecs_to_jiffies(ops->timeout_ms);
3667 	else
3668 		timeout = SCX_WATCHDOG_MAX_TIMEOUT;
3669 
3670 	WRITE_ONCE(scx_watchdog_timeout, timeout);
3671 	WRITE_ONCE(scx_watchdog_timestamp, jiffies);
3672 	queue_delayed_work(system_unbound_wq, &scx_watchdog_work,
3673 			   scx_watchdog_timeout / 2);
3674 
3675 	/*
3676 	 * Lock out forks before opening the floodgate so that they don't wander
3677 	 * into the operations prematurely.
3678 	 *
3679 	 * We don't need to keep the CPUs stable but grab cpus_read_lock() to
3680 	 * ease future locking changes for cgroup support.
3681 	 *
3682 	 * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the
3683 	 * following dependency chain:
3684 	 *
3685 	 *   scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock
3686 	 */
3687 	percpu_down_write(&scx_fork_rwsem);
3688 	cpus_read_lock();
3689 
3690 	for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++)
3691 		if (((void (**)(void))ops)[i])
3692 			static_branch_enable_cpuslocked(&scx_has_op[i]);
3693 
3694 	if (ops->flags & SCX_OPS_ENQ_LAST)
3695 		static_branch_enable_cpuslocked(&scx_ops_enq_last);
3696 
3697 	if (ops->flags & SCX_OPS_ENQ_EXITING)
3698 		static_branch_enable_cpuslocked(&scx_ops_enq_exiting);
3699 
3700 	if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) {
3701 		reset_idle_masks();
3702 		static_branch_enable_cpuslocked(&scx_builtin_idle_enabled);
3703 	} else {
3704 		static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
3705 	}
3706 
3707 	static_branch_enable_cpuslocked(&__scx_ops_enabled);
3708 
3709 	/*
3710 	 * Enable ops for every task. Fork is excluded by scx_fork_rwsem
3711 	 * preventing new tasks from being added. No need to exclude tasks
3712 	 * leaving as sched_ext_free() can handle both prepped and enabled
3713 	 * tasks. Prep all tasks first and then enable them with preemption
3714 	 * disabled.
3715 	 */
3716 	spin_lock_irq(&scx_tasks_lock);
3717 
3718 	scx_task_iter_init(&sti);
3719 	while ((p = scx_task_iter_next_locked(&sti, false))) {
3720 		get_task_struct(p);
3721 		scx_task_iter_rq_unlock(&sti);
3722 		spin_unlock_irq(&scx_tasks_lock);
3723 
3724 		ret = scx_ops_init_task(p, task_group(p), false);
3725 		if (ret) {
3726 			put_task_struct(p);
3727 			spin_lock_irq(&scx_tasks_lock);
3728 			scx_task_iter_exit(&sti);
3729 			spin_unlock_irq(&scx_tasks_lock);
3730 			pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n",
3731 			       ret, p->comm, p->pid);
3732 			goto err_disable_unlock_all;
3733 		}
3734 
3735 		put_task_struct(p);
3736 		spin_lock_irq(&scx_tasks_lock);
3737 	}
3738 	scx_task_iter_exit(&sti);
3739 
3740 	/*
3741 	 * All tasks are prepped but are still ops-disabled. Ensure that
3742 	 * %current can't be scheduled out and switch everyone.
3743 	 * preempt_disable() is necessary because we can't guarantee that
3744 	 * %current won't be starved if scheduled out while switching.
3745 	 */
3746 	preempt_disable();
3747 
3748 	/*
3749 	 * From here on, the disable path must assume that tasks have ops
3750 	 * enabled and need to be recovered.
3751 	 *
3752 	 * Transition to ENABLING fails iff the BPF scheduler has already
3753 	 * triggered scx_bpf_error(). Returning an error code here would lose
3754 	 * the recorded error information. Exit indicating success so that the
3755 	 * error is notified through ops.exit() with all the details.
3756 	 */
3757 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) {
3758 		preempt_enable();
3759 		spin_unlock_irq(&scx_tasks_lock);
3760 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
3761 		ret = 0;
3762 		goto err_disable_unlock_all;
3763 	}
3764 
3765 	/*
3766 	 * We're fully committed and can't fail. The PREPPED -> ENABLED
3767 	 * transitions here are synchronized against sched_ext_free() through
3768 	 * scx_tasks_lock.
3769 	 */
3770 	WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL));
3771 
3772 	scx_task_iter_init(&sti);
3773 	while ((p = scx_task_iter_next_locked(&sti, false))) {
3774 		const struct sched_class *old_class = p->sched_class;
3775 		struct sched_enq_and_set_ctx ctx;
3776 
3777 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
3778 
3779 		scx_set_task_state(p, SCX_TASK_READY);
3780 		__setscheduler_prio(p, p->prio);
3781 		check_class_changing(task_rq(p), p, old_class);
3782 
3783 		sched_enq_and_set_task(&ctx);
3784 
3785 		check_class_changed(task_rq(p), p, old_class, p->prio);
3786 	}
3787 	scx_task_iter_exit(&sti);
3788 
3789 	spin_unlock_irq(&scx_tasks_lock);
3790 	preempt_enable();
3791 	cpus_read_unlock();
3792 	percpu_up_write(&scx_fork_rwsem);
3793 
3794 	/* see above ENABLING transition for the explanation on exiting with 0 */
3795 	if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) {
3796 		WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE);
3797 		ret = 0;
3798 		goto err_disable;
3799 	}
3800 
3801 	if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL))
3802 		static_branch_enable(&__scx_switched_all);
3803 
3804 	kobject_uevent(scx_root_kobj, KOBJ_ADD);
3805 	mutex_unlock(&scx_ops_enable_mutex);
3806 
3807 	return 0;
3808 
3809 err_del:
3810 	kobject_del(scx_root_kobj);
3811 err:
3812 	kobject_put(scx_root_kobj);
3813 	scx_root_kobj = NULL;
3814 	if (scx_exit_info) {
3815 		free_exit_info(scx_exit_info);
3816 		scx_exit_info = NULL;
3817 	}
3818 err_unlock:
3819 	mutex_unlock(&scx_ops_enable_mutex);
3820 	return ret;
3821 
3822 err_disable_unlock_all:
3823 	percpu_up_write(&scx_fork_rwsem);
3824 err_disable_unlock_cpus:
3825 	cpus_read_unlock();
3826 err_disable:
3827 	mutex_unlock(&scx_ops_enable_mutex);
3828 	/* must be fully disabled before returning */
3829 	scx_ops_disable(SCX_EXIT_ERROR);
3830 	kthread_flush_work(&scx_ops_disable_work);
3831 	return ret;
3832 }
3833 
3834 
3835 /********************************************************************************
3836  * bpf_struct_ops plumbing.
3837  */
3838 #include <linux/bpf_verifier.h>
3839 #include <linux/bpf.h>
3840 #include <linux/btf.h>
3841 
3842 extern struct btf *btf_vmlinux;
3843 static const struct btf_type *task_struct_type;
3844 static u32 task_struct_type_id;
3845 
3846 static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size,
3847 			       enum bpf_access_type type,
3848 			       const struct bpf_prog *prog,
3849 			       struct bpf_insn_access_aux *info)
3850 {
3851 	struct btf *btf = bpf_get_btf_vmlinux();
3852 	const struct bpf_struct_ops_desc *st_ops_desc;
3853 	const struct btf_member *member;
3854 	const struct btf_type *t;
3855 	u32 btf_id, member_idx;
3856 	const char *mname;
3857 
3858 	/* struct_ops op args are all sequential, 64-bit numbers */
3859 	if (off != arg_n * sizeof(__u64))
3860 		return false;
3861 
3862 	/* btf_id should be the type id of struct sched_ext_ops */
3863 	btf_id = prog->aux->attach_btf_id;
3864 	st_ops_desc = bpf_struct_ops_find(btf, btf_id);
3865 	if (!st_ops_desc)
3866 		return false;
3867 
3868 	/* BTF type of struct sched_ext_ops */
3869 	t = st_ops_desc->type;
3870 
3871 	member_idx = prog->expected_attach_type;
3872 	if (member_idx >= btf_type_vlen(t))
3873 		return false;
3874 
3875 	/*
3876 	 * Get the member name of this struct_ops program, which corresponds to
3877 	 * a field in struct sched_ext_ops. For example, the member name of the
3878 	 * dispatch struct_ops program (callback) is "dispatch".
3879 	 */
3880 	member = &btf_type_member(t)[member_idx];
3881 	mname = btf_name_by_offset(btf_vmlinux, member->name_off);
3882 
3883 	if (!strcmp(mname, op)) {
3884 		/*
3885 		 * The value is a pointer to a type (struct task_struct) given
3886 		 * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED),
3887 		 * however, can be a NULL (PTR_MAYBE_NULL). The BPF program
3888 		 * should check the pointer to make sure it is not NULL before
3889 		 * using it, or the verifier will reject the program.
3890 		 *
3891 		 * Longer term, this is something that should be addressed by
3892 		 * BTF, and be fully contained within the verifier.
3893 		 */
3894 		info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED;
3895 		info->btf = btf_vmlinux;
3896 		info->btf_id = task_struct_type_id;
3897 
3898 		return true;
3899 	}
3900 
3901 	return false;
3902 }
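
/*
 * Illustrative sketch, BPF scheduler side and not part of this file: because
 * the second argument of ops.dispatch() and ops.yield() is PTR_MAYBE_NULL as
 * set up above, a BPF scheduler must NULL-check it before dereferencing or
 * the verifier rejects the program. The BPF_STRUCT_OPS() wrapper and
 * SCX_SLICE_DFL are assumed to come from the scheduler-side sched_ext
 * headers; the NULL check is the point being shown.
 *
 *	void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		if (prev && !prev->scx.slice)
 *			prev->scx.slice = SCX_SLICE_DFL;
 *	}
 */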
3903 
3904 static bool bpf_scx_is_valid_access(int off, int size,
3905 				    enum bpf_access_type type,
3906 				    const struct bpf_prog *prog,
3907 				    struct bpf_insn_access_aux *info)
3908 {
3909 	if (type != BPF_READ)
3910 		return false;
3911 	if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) ||
3912 	    set_arg_maybe_null("yield", 1, off, size, type, prog, info))
3913 		return true;
3914 	if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS)
3915 		return false;
3916 	if (off % size != 0)
3917 		return false;
3918 
3919 	return btf_ctx_access(off, size, type, prog, info);
3920 }
3921 
3922 static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log,
3923 				     const struct bpf_reg_state *reg, int off,
3924 				     int size)
3925 {
3926 	const struct btf_type *t;
3927 
3928 	t = btf_type_by_id(reg->btf, reg->btf_id);
3929 	if (t == task_struct_type) {
3930 		if (off >= offsetof(struct task_struct, scx.slice) &&
3931 		    off + size <= offsetofend(struct task_struct, scx.slice))
3932 			return SCALAR_VALUE;
3933 		if (off >= offsetof(struct task_struct, scx.disallow) &&
3934 		    off + size <= offsetofend(struct task_struct, scx.disallow))
3935 			return SCALAR_VALUE;
3936 	}
3937 
3938 	return -EACCES;
3939 }
3940 
3941 static const struct bpf_func_proto *
3942 bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
3943 {
3944 	switch (func_id) {
3945 	case BPF_FUNC_task_storage_get:
3946 		return &bpf_task_storage_get_proto;
3947 	case BPF_FUNC_task_storage_delete:
3948 		return &bpf_task_storage_delete_proto;
3949 	default:
3950 		return bpf_base_func_proto(func_id, prog);
3951 	}
3952 }
3953 
3954 static const struct bpf_verifier_ops bpf_scx_verifier_ops = {
3955 	.get_func_proto = bpf_scx_get_func_proto,
3956 	.is_valid_access = bpf_scx_is_valid_access,
3957 	.btf_struct_access = bpf_scx_btf_struct_access,
3958 };
3959 
3960 static int bpf_scx_init_member(const struct btf_type *t,
3961 			       const struct btf_member *member,
3962 			       void *kdata, const void *udata)
3963 {
3964 	const struct sched_ext_ops *uops = udata;
3965 	struct sched_ext_ops *ops = kdata;
3966 	u32 moff = __btf_member_bit_offset(t, member) / 8;
3967 	int ret;
3968 
3969 	switch (moff) {
3970 	case offsetof(struct sched_ext_ops, dispatch_max_batch):
3971 		if (*(u32 *)(udata + moff) > INT_MAX)
3972 			return -E2BIG;
3973 		ops->dispatch_max_batch = *(u32 *)(udata + moff);
3974 		return 1;
3975 	case offsetof(struct sched_ext_ops, flags):
3976 		if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS)
3977 			return -EINVAL;
3978 		ops->flags = *(u64 *)(udata + moff);
3979 		return 1;
3980 	case offsetof(struct sched_ext_ops, name):
3981 		ret = bpf_obj_name_cpy(ops->name, uops->name,
3982 				       sizeof(ops->name));
3983 		if (ret < 0)
3984 			return ret;
3985 		if (ret == 0)
3986 			return -EINVAL;
3987 		return 1;
3988 	case offsetof(struct sched_ext_ops, timeout_ms):
3989 		if (msecs_to_jiffies(*(u32 *)(udata + moff)) >
3990 		    SCX_WATCHDOG_MAX_TIMEOUT)
3991 			return -E2BIG;
3992 		ops->timeout_ms = *(u32 *)(udata + moff);
3993 		return 1;
3994 	case offsetof(struct sched_ext_ops, exit_dump_len):
3995 		ops->exit_dump_len =
3996 			*(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN;
3997 		return 1;
3998 	}
3999 
4000 	return 0;
4001 }
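
/*
 * Illustrative sketch, BPF scheduler side: the scalar members special-cased
 * above are normally set directly in the struct_ops map definition. The
 * SEC() annotation and the example_* callback symbols are assumptions about
 * the scheduler's own sources, not something defined in this file.
 *
 *	SEC(".struct_ops.link")
 *	struct sched_ext_ops example_ops = {
 *		.enqueue		= (void *)example_enqueue,
 *		.dispatch		= (void *)example_dispatch,
 *		.init			= (void *)example_init,
 *		.dispatch_max_batch	= 8,
 *		.flags			= SCX_OPS_ENQ_LAST,
 *		.timeout_ms		= 5000,
 *		.exit_dump_len		= 65536,
 *		.name			= "example",
 *	};
 */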
4002 
4003 static int bpf_scx_check_member(const struct btf_type *t,
4004 				const struct btf_member *member,
4005 				const struct bpf_prog *prog)
4006 {
4007 	u32 moff = __btf_member_bit_offset(t, member) / 8;
4008 
4009 	switch (moff) {
4010 	case offsetof(struct sched_ext_ops, init_task):
4011 	case offsetof(struct sched_ext_ops, init):
4012 	case offsetof(struct sched_ext_ops, exit):
4013 		break;
4014 	default:
4015 		if (prog->sleepable)
4016 			return -EINVAL;
4017 	}
4018 
4019 	return 0;
4020 }
4021 
4022 static int bpf_scx_reg(void *kdata, struct bpf_link *link)
4023 {
4024 	return scx_ops_enable(kdata, link);
4025 }
4026 
4027 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
4028 {
4029 	scx_ops_disable(SCX_EXIT_UNREG);
4030 	kthread_flush_work(&scx_ops_disable_work);
4031 }
4032 
4033 static int bpf_scx_init(struct btf *btf)
4034 {
4035 	s32 type_id;
4036 
4037 	type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT);
4038 	if (type_id < 0)
4039 		return -EINVAL;
4040 	task_struct_type = btf_type_by_id(btf, type_id);
4041 	task_struct_type_id = type_id;
4042 
4043 	return 0;
4044 }
4045 
4046 static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link)
4047 {
4048 	/*
4049 	 * sched_ext does not support updating the actively-loaded BPF
4050 	 * scheduler, as registering a BPF scheduler can always fail if the
4051 	 * scheduler returns an error code for e.g. ops.init(), ops.init_task(),
4052 	 * etc. Similarly, we can always race with unregistration happening
4053 	 * elsewhere, such as with sysrq.
4054 	 */
4055 	return -EOPNOTSUPP;
4056 }
4057 
4058 static int bpf_scx_validate(void *kdata)
4059 {
4060 	return 0;
4061 }
4062 
4063 static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; }
4064 static void enqueue_stub(struct task_struct *p, u64 enq_flags) {}
4065 static void dequeue_stub(struct task_struct *p, u64 enq_flags) {}
4066 static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {}
4067 static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; }
4068 static void set_weight_stub(struct task_struct *p, u32 weight) {}
4069 static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {}
4070 static void update_idle_stub(s32 cpu, bool idle) {}
4071 static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; }
4072 static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {}
4073 static void enable_stub(struct task_struct *p) {}
4074 static void disable_stub(struct task_struct *p) {}
4075 static s32 init_stub(void) { return -EINVAL; }
4076 static void exit_stub(struct scx_exit_info *info) {}
4077 
4078 static struct sched_ext_ops __bpf_ops_sched_ext_ops = {
4079 	.select_cpu = select_cpu_stub,
4080 	.enqueue = enqueue_stub,
4081 	.dequeue = dequeue_stub,
4082 	.dispatch = dispatch_stub,
4083 	.yield = yield_stub,
4084 	.set_weight = set_weight_stub,
4085 	.set_cpumask = set_cpumask_stub,
4086 	.update_idle = update_idle_stub,
4087 	.init_task = init_task_stub,
4088 	.exit_task = exit_task_stub,
4089 	.enable = enable_stub,
4090 	.disable = disable_stub,
4091 	.init = init_stub,
4092 	.exit = exit_stub,
4093 };
4094 
4095 static struct bpf_struct_ops bpf_sched_ext_ops = {
4096 	.verifier_ops = &bpf_scx_verifier_ops,
4097 	.reg = bpf_scx_reg,
4098 	.unreg = bpf_scx_unreg,
4099 	.check_member = bpf_scx_check_member,
4100 	.init_member = bpf_scx_init_member,
4101 	.init = bpf_scx_init,
4102 	.update = bpf_scx_update,
4103 	.validate = bpf_scx_validate,
4104 	.name = "sched_ext_ops",
4105 	.owner = THIS_MODULE,
4106 	.cfi_stubs = &__bpf_ops_sched_ext_ops
4107 };
4108 
4109 
4110 /********************************************************************************
4111  * System integration and init.
4112  */
4113 
4114 static void sysrq_handle_sched_ext_reset(u8 key)
4115 {
4116 	if (scx_ops_helper)
4117 		scx_ops_disable(SCX_EXIT_SYSRQ);
4118 	else
4119 		pr_info("sched_ext: BPF scheduler not yet used\n");
4120 }
4121 
4122 static const struct sysrq_key_op sysrq_sched_ext_reset_op = {
4123 	.handler	= sysrq_handle_sched_ext_reset,
4124 	.help_msg	= "reset-sched-ext(S)",
4125 	.action_msg	= "Disable sched_ext and revert all tasks to CFS",
4126 	.enable_mask	= SYSRQ_ENABLE_RTNICE,
4127 };
4128 
4129 static void sysrq_handle_sched_ext_dump(u8 key)
4130 {
4131 	struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" };
4132 
4133 	if (scx_enabled())
4134 		scx_dump_state(&ei, 0);
4135 }
4136 
4137 static const struct sysrq_key_op sysrq_sched_ext_dump_op = {
4138 	.handler	= sysrq_handle_sched_ext_dump,
4139 	.help_msg	= "dump-sched-ext(D)",
4140 	.action_msg	= "Trigger sched_ext debug dump",
4141 	.enable_mask	= SYSRQ_ENABLE_RTNICE,
4142 };
4143 
4144 static bool can_skip_idle_kick(struct rq *rq)
4145 {
4146 	lockdep_assert_rq_held(rq);
4147 
4148 	/*
4149 	 * We can skip idle kicking if @rq is going to go through at least one
4150 	 * full SCX scheduling cycle before going idle. Just checking whether
4151 	 * curr is not idle is insufficient because we could be racing
4152 	 * balance_one() trying to pull the next task from a remote rq, which
4153 	 * may fail, and @rq may become idle afterwards.
4154 	 *
4155 	 * The race window is small and we don't and can't guarantee that @rq is
4156 	 * only kicked while idle anyway. Skip only when sure.
4157 	 */
4158 	return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING);
4159 }
4160 
4161 static void kick_one_cpu(s32 cpu, struct rq *this_rq)
4162 {
4163 	struct rq *rq = cpu_rq(cpu);
4164 	struct scx_rq *this_scx = &this_rq->scx;
4165 	unsigned long flags;
4166 
4167 	raw_spin_rq_lock_irqsave(rq, flags);
4168 
4169 	/*
4170 	 * During CPU hotplug, a CPU may depend on kicking itself to make
4171 	 * forward progress. Allow kicking self regardless of online state.
4172 	 */
4173 	if (cpu_online(cpu) || cpu == cpu_of(this_rq)) {
4174 		if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) {
4175 			if (rq->curr->sched_class == &ext_sched_class)
4176 				rq->curr->scx.slice = 0;
4177 			cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
4178 		}
4179 
4180 		resched_curr(rq);
4181 	} else {
4182 		cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt);
4183 	}
4184 
4185 	raw_spin_rq_unlock_irqrestore(rq, flags);
4186 }
4187 
4188 static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq)
4189 {
4190 	struct rq *rq = cpu_rq(cpu);
4191 	unsigned long flags;
4192 
4193 	raw_spin_rq_lock_irqsave(rq, flags);
4194 
4195 	if (!can_skip_idle_kick(rq) &&
4196 	    (cpu_online(cpu) || cpu == cpu_of(this_rq)))
4197 		resched_curr(rq);
4198 
4199 	raw_spin_rq_unlock_irqrestore(rq, flags);
4200 }
4201 
4202 static void kick_cpus_irq_workfn(struct irq_work *irq_work)
4203 {
4204 	struct rq *this_rq = this_rq();
4205 	struct scx_rq *this_scx = &this_rq->scx;
4206 	s32 cpu;
4207 
4208 	for_each_cpu(cpu, this_scx->cpus_to_kick) {
4209 		kick_one_cpu(cpu, this_rq);
4210 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick);
4211 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
4212 	}
4213 
4214 	for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) {
4215 		kick_one_cpu_if_idle(cpu, this_rq);
4216 		cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle);
4217 	}
4218 }
4219 
4220 /**
4221  * print_scx_info - print out sched_ext scheduler state
4222  * @log_lvl: the log level to use when printing
4223  * @p: target task
4224  *
4225  * If a sched_ext scheduler is enabled, print the name and state of the
4226  * scheduler. If @p is on sched_ext, print further information about the task.
4227  *
4228  * This function can be safely called on any task as long as the task_struct
4229  * itself is accessible. While safe, this function isn't synchronized and may
4230  * print out mixed-up or garbled information of limited length.
4231  */
4232 void print_scx_info(const char *log_lvl, struct task_struct *p)
4233 {
4234 	enum scx_ops_enable_state state = scx_ops_enable_state();
4235 	const char *all = READ_ONCE(scx_switching_all) ? "+all" : "";
4236 	char runnable_at_buf[22] = "?";
4237 	struct sched_class *class;
4238 	unsigned long runnable_at;
4239 
4240 	if (state == SCX_OPS_DISABLED)
4241 		return;
4242 
4243 	/*
4244 	 * Carefully check if the task was running on sched_ext, and then
4245 	 * carefully copy the time it's been runnable, and its state.
4246 	 */
4247 	if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) ||
4248 	    class != &ext_sched_class) {
4249 		printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name,
4250 		       scx_ops_enable_state_str[state], all);
4251 		return;
4252 	}
4253 
4254 	if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at,
4255 				      sizeof(runnable_at)))
4256 		scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms",
4257 			  jiffies_delta_msecs(runnable_at, jiffies));
4258 
4259 	/* print everything onto one line to conserve console space */
4260 	printk("%sSched_ext: %s (%s%s), task: runnable_at=%s",
4261 	       log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all,
4262 	       runnable_at_buf);
4263 }
4264 
4265 void __init init_sched_ext_class(void)
4266 {
4267 	s32 cpu, v;
4268 
4269 	/*
4270 	 * The following is to prevent the compiler from optimizing out the enum
4271 	 * definitions so that BPF scheduler implementations can use them
4272 	 * through the generated vmlinux.h.
4273 	 */
4274 	WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT);
4275 
4276 	BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params));
4277 	init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL);
4278 #ifdef CONFIG_SMP
4279 	BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL));
4280 	BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL));
4281 #endif
4282 	for_each_possible_cpu(cpu) {
4283 		struct rq *rq = cpu_rq(cpu);
4284 
4285 		init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL);
4286 		INIT_LIST_HEAD(&rq->scx.runnable_list);
4287 
4288 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL));
4289 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL));
4290 		BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL));
4291 		init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn);
4292 	}
4293 
4294 	register_sysrq_key('S', &sysrq_sched_ext_reset_op);
4295 	register_sysrq_key('D', &sysrq_sched_ext_dump_op);
4296 	INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn);
4297 }
4298 
4299 
4300 /********************************************************************************
4301  * Helpers that can be called from the BPF scheduler.
4302  */
4303 #include <linux/btf_ids.h>
4304 
4305 __bpf_kfunc_start_defs();
4306 
4307 /**
4308  * scx_bpf_create_dsq - Create a custom DSQ
4309  * @dsq_id: DSQ to create
4310  * @node: NUMA node to allocate from
4311  *
4312  * Create a custom DSQ identified by @dsq_id. Can be called from ops.init() and
4313  * ops.init_task().
4314  */
4315 __bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node)
4316 {
4317 	if (!scx_kf_allowed(SCX_KF_SLEEPABLE))
4318 		return -EINVAL;
4319 
4320 	if (unlikely(node >= (int)nr_node_ids ||
4321 		     (node < 0 && node != NUMA_NO_NODE)))
4322 		return -EINVAL;
4323 	return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node));
4324 }
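
/*
 * Illustrative sketch, BPF scheduler side: a minimal ops.init() creating one
 * shared DSQ. MY_DSQ_ID is a scheduler-private identifier picked for the
 * example and -1 (NUMA_NO_NODE) allows allocation from any node per the
 * check above. The BPF_STRUCT_OPS_SLEEPABLE() wrapper is assumed to come
 * from the scheduler-side sched_ext headers.
 *
 *	#define MY_DSQ_ID	0
 *
 *	s32 BPF_STRUCT_OPS_SLEEPABLE(example_init)
 *	{
 *		return scx_bpf_create_dsq(MY_DSQ_ID, -1);
 *	}
 */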
4325 
4326 __bpf_kfunc_end_defs();
4327 
4328 BTF_KFUNCS_START(scx_kfunc_ids_sleepable)
4329 BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE)
4330 BTF_KFUNCS_END(scx_kfunc_ids_sleepable)
4331 
4332 static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = {
4333 	.owner			= THIS_MODULE,
4334 	.set			= &scx_kfunc_ids_sleepable,
4335 };
4336 
4337 __bpf_kfunc_start_defs();
4338 
4339 /**
4340  * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu()
4341  * @p: task_struct to select a CPU for
4342  * @prev_cpu: CPU @p was on previously
4343  * @wake_flags: %SCX_WAKE_* flags
4344  * @is_idle: out parameter indicating whether the returned CPU is idle
4345  *
4346  * Can only be called from ops.select_cpu() if the built-in CPU selection is
4347  * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set.
4348  * @p, @prev_cpu and @wake_flags match ops.select_cpu().
4349  *
4350  * Returns the picked CPU with *@is_idle indicating whether the picked CPU is
4351  * currently idle and thus a good candidate for direct dispatching.
4352  */
4353 __bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu,
4354 				       u64 wake_flags, bool *is_idle)
4355 {
4356 	if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) {
4357 		*is_idle = false;
4358 		return prev_cpu;
4359 	}
4360 #ifdef CONFIG_SMP
4361 	return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle);
4362 #else
4363 	*is_idle = false;
4364 	return prev_cpu;
4365 #endif
4366 }
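
/*
 * Illustrative sketch, BPF scheduler side: an ops.select_cpu() that defers to
 * the built-in selection above and direct-dispatches when an idle CPU was
 * found, the usage pattern the comment describes. BPF_STRUCT_OPS() and
 * SCX_SLICE_DFL are assumed to come from the scheduler-side sched_ext
 * headers.
 *
 *	s32 BPF_STRUCT_OPS(example_select_cpu, struct task_struct *p,
 *			   s32 prev_cpu, u64 wake_flags)
 *	{
 *		bool is_idle = false;
 *		s32 cpu;
 *
 *		cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle);
 *		if (is_idle)
 *			scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
 *		return cpu;
 *	}
 */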
4367 
4368 __bpf_kfunc_end_defs();
4369 
4370 BTF_KFUNCS_START(scx_kfunc_ids_select_cpu)
4371 BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU)
4372 BTF_KFUNCS_END(scx_kfunc_ids_select_cpu)
4373 
4374 static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = {
4375 	.owner			= THIS_MODULE,
4376 	.set			= &scx_kfunc_ids_select_cpu,
4377 };
4378 
4379 static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags)
4380 {
4381 	if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH))
4382 		return false;
4383 
4384 	lockdep_assert_irqs_disabled();
4385 
4386 	if (unlikely(!p)) {
4387 		scx_ops_error("called with NULL task");
4388 		return false;
4389 	}
4390 
4391 	if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) {
4392 		scx_ops_error("invalid enq_flags 0x%llx", enq_flags);
4393 		return false;
4394 	}
4395 
4396 	return true;
4397 }
4398 
4399 static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags)
4400 {
4401 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
4402 	struct task_struct *ddsp_task;
4403 
4404 	ddsp_task = __this_cpu_read(direct_dispatch_task);
4405 	if (ddsp_task) {
4406 		mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags);
4407 		return;
4408 	}
4409 
4410 	if (unlikely(dspc->cursor >= scx_dsp_max_batch)) {
4411 		scx_ops_error("dispatch buffer overflow");
4412 		return;
4413 	}
4414 
4415 	dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){
4416 		.task = p,
4417 		.qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK,
4418 		.dsq_id = dsq_id,
4419 		.enq_flags = enq_flags,
4420 	};
4421 }
4422 
4423 __bpf_kfunc_start_defs();
4424 
4425 /**
4426  * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ
4427  * @p: task_struct to dispatch
4428  * @dsq_id: DSQ to dispatch to
4429  * @slice: duration @p can run for in nsecs
4430  * @enq_flags: SCX_ENQ_*
4431  *
4432  * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe
4433  * to call this function spuriously. Can be called from ops.enqueue(),
4434  * ops.select_cpu(), and ops.dispatch().
4435  *
4436  * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch
4437  * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be
4438  * used to target the local DSQ of a CPU other than the enqueueing one. Use
4439  * ops.select_cpu() to be on the target CPU in the first place.
4440  *
4441  * When called from ops.select_cpu(), @enq_flags and @dsq_id are stored, and @p
4442  * will be directly dispatched to the corresponding dispatch queue after
4443  * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be
4444  * dispatched to the local DSQ of the CPU returned by ops.select_cpu().
4445  * @enq_flags are OR'd with the enqueue flags on the enqueue path before the
4446  * task is dispatched.
4447  *
4448  * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id
4449  * and this function can be called up to ops.dispatch_max_batch times to dispatch
4450  * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the
4451  * remaining slots. scx_bpf_consume() flushes the batch and resets the counter.
4452  *
4453  * This function doesn't have any locking restrictions and may be called under
4454  * BPF locks (in the future when BPF introduces more flexible locking).
4455  *
4456  * @p is allowed to run for @slice. The scheduling path is triggered on slice
4457  * exhaustion. If zero, the current residual slice is maintained.
4458  */
4459 __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice,
4460 				  u64 enq_flags)
4461 {
4462 	if (!scx_dispatch_preamble(p, enq_flags))
4463 		return;
4464 
4465 	if (slice)
4466 		p->scx.slice = slice;
4467 	else
4468 		p->scx.slice = p->scx.slice ?: 1;
4469 
4470 	scx_dispatch_commit(p, dsq_id, enq_flags);
4471 }
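
/*
 * Illustrative sketch, BPF scheduler side: a FIFO ops.enqueue() that sends
 * every task to a scheduler-defined DSQ (MY_DSQ_ID, assumed to have been
 * created via scx_bpf_create_dsq()) with the default slice, passing
 * @enq_flags through so that flags set by the core are preserved.
 * BPF_STRUCT_OPS() and SCX_SLICE_DFL are assumed from the scheduler-side
 * sched_ext headers.
 *
 *	void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
 *	{
 *		scx_bpf_dispatch(p, MY_DSQ_ID, SCX_SLICE_DFL, enq_flags);
 *	}
 */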
4472 
4473 __bpf_kfunc_end_defs();
4474 
4475 BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch)
4476 BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU)
4477 BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch)
4478 
4479 static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = {
4480 	.owner			= THIS_MODULE,
4481 	.set			= &scx_kfunc_ids_enqueue_dispatch,
4482 };
4483 
4484 __bpf_kfunc_start_defs();
4485 
4486 /**
4487  * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots
4488  *
4489  * Can only be called from ops.dispatch().
4490  */
4491 __bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void)
4492 {
4493 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
4494 		return 0;
4495 
4496 	return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor);
4497 }
4498 
4499 /**
4500  * scx_bpf_dispatch_cancel - Cancel the latest dispatch
4501  *
4502  * Cancel the latest dispatch. Can be called multiple times to cancel further
4503  * dispatches. Can only be called from ops.dispatch().
4504  */
4505 __bpf_kfunc void scx_bpf_dispatch_cancel(void)
4506 {
4507 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
4508 
4509 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
4510 		return;
4511 
4512 	if (dspc->cursor > 0)
4513 		dspc->cursor--;
4514 	else
4515 		scx_ops_error("dispatch buffer underflow");
4516 }
4517 
4518 /**
4519  * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ
4520  * @dsq_id: DSQ to consume
4521  *
4522  * Consume a task from the non-local DSQ identified by @dsq_id and transfer it
4523  * to the current CPU's local DSQ for execution. Can only be called from
4524  * ops.dispatch().
4525  *
4526  * This function flushes the in-flight dispatches from scx_bpf_dispatch() before
4527  * trying to consume the specified DSQ. It may also grab rq locks and thus can't
4528  * be called under any BPF locks.
4529  *
4530  * Returns %true if a task has been consumed, %false if there isn't any task to
4531  * consume.
4532  */
4533 __bpf_kfunc bool scx_bpf_consume(u64 dsq_id)
4534 {
4535 	struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx);
4536 	struct scx_dispatch_q *dsq;
4537 
4538 	if (!scx_kf_allowed(SCX_KF_DISPATCH))
4539 		return false;
4540 
4541 	flush_dispatch_buf(dspc->rq, dspc->rf);
4542 
4543 	dsq = find_non_local_dsq(dsq_id);
4544 	if (unlikely(!dsq)) {
4545 		scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id);
4546 		return false;
4547 	}
4548 
4549 	if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) {
4550 		/*
4551 		 * A successfully consumed task can be dequeued before it starts
4552 		 * running while the CPU is trying to migrate other dispatched
4553 		 * tasks. Bump nr_tasks to tell balance_scx() to retry on empty
4554 		 * local DSQ.
4555 		 */
4556 		dspc->nr_tasks++;
4557 		return true;
4558 	} else {
4559 		return false;
4560 	}
4561 }
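
/*
 * Illustrative sketch, BPF scheduler side: the ops.dispatch() counterpart of
 * the enqueue example above, pulling one task from the scheduler-defined DSQ
 * into the local DSQ of the CPU being dispatched for. BPF_STRUCT_OPS() and
 * MY_DSQ_ID are assumptions carried over from the earlier sketches.
 *
 *	void BPF_STRUCT_OPS(fifo_dispatch, s32 cpu, struct task_struct *prev)
 *	{
 *		scx_bpf_consume(MY_DSQ_ID);
 *	}
 */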
4562 
4563 __bpf_kfunc_end_defs();
4564 
4565 BTF_KFUNCS_START(scx_kfunc_ids_dispatch)
4566 BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots)
4567 BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel)
4568 BTF_ID_FLAGS(func, scx_bpf_consume)
4569 BTF_KFUNCS_END(scx_kfunc_ids_dispatch)
4570 
4571 static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = {
4572 	.owner			= THIS_MODULE,
4573 	.set			= &scx_kfunc_ids_dispatch,
4574 };
4575 
4576 __bpf_kfunc_start_defs();
4577 
4578 /**
4579  * scx_bpf_kick_cpu - Trigger reschedule on a CPU
4580  * @cpu: cpu to kick
4581  * @flags: %SCX_KICK_* flags
4582  *
4583  * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or
4584  * trigger rescheduling on a busy CPU. This can be called from any online
4585  * scx_ops operation and the actual kicking is performed asynchronously through
4586  * an irq work.
4587  */
4588 __bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags)
4589 {
4590 	struct rq *this_rq;
4591 	unsigned long irq_flags;
4592 
4593 	if (!ops_cpu_valid(cpu, NULL))
4594 		return;
4595 
4596 	/*
4597 	 * While bypassing for PM ops, IRQ handling may not be online, which can
4598 	 * lead to irq_work_queue() malfunctioning, such as an infinite busy wait
4599 	 * for an IRQ status update. Suppress kicking.
4600 	 */
4601 	if (scx_ops_bypassing())
4602 		return;
4603 
4604 	local_irq_save(irq_flags);
4605 
4606 	this_rq = this_rq();
4607 
4608 	/*
4609 	 * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting
4610 	 * rq locks. We can probably be smarter and avoid bouncing if called
4611 	 * from ops which don't hold a rq lock.
4612 	 */
4613 	if (flags & SCX_KICK_IDLE) {
4614 		struct rq *target_rq = cpu_rq(cpu);
4615 
4616 		if (unlikely(flags & SCX_KICK_PREEMPT))
4617 			scx_ops_error("PREEMPT cannot be used with SCX_KICK_IDLE");
4618 
4619 		if (raw_spin_rq_trylock(target_rq)) {
4620 			if (can_skip_idle_kick(target_rq)) {
4621 				raw_spin_rq_unlock(target_rq);
4622 				goto out;
4623 			}
4624 			raw_spin_rq_unlock(target_rq);
4625 		}
4626 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle);
4627 	} else {
4628 		cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick);
4629 
4630 		if (flags & SCX_KICK_PREEMPT)
4631 			cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt);
4632 	}
4633 
4634 	irq_work_queue(&this_rq->scx.kick_cpus_irq_work);
4635 out:
4636 	local_irq_restore(irq_flags);
4637 }
4638 
4639 /**
4640  * scx_bpf_dsq_nr_queued - Return the number of queued tasks
4641  * @dsq_id: id of the DSQ
4642  *
4643  * Return the number of tasks in the DSQ matching @dsq_id. If not found,
4644  * -%ENOENT is returned.
4645  */
4646 __bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id)
4647 {
4648 	struct scx_dispatch_q *dsq;
4649 	s32 ret;
4650 
4651 	preempt_disable();
4652 
4653 	if (dsq_id == SCX_DSQ_LOCAL) {
4654 		ret = READ_ONCE(this_rq()->scx.local_dsq.nr);
4655 		goto out;
4656 	} else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) {
4657 		s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK;
4658 
4659 		if (ops_cpu_valid(cpu, NULL)) {
4660 			ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr);
4661 			goto out;
4662 		}
4663 	} else {
4664 		dsq = find_non_local_dsq(dsq_id);
4665 		if (dsq) {
4666 			ret = READ_ONCE(dsq->nr);
4667 			goto out;
4668 		}
4669 	}
4670 	ret = -ENOENT;
4671 out:
4672 	preempt_enable();
4673 	return ret;
4674 }
4675 
4676 /**
4677  * scx_bpf_destroy_dsq - Destroy a custom DSQ
4678  * @dsq_id: DSQ to destroy
4679  *
4680  * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with
4681  * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is
4682  * empty and no further tasks are dispatched to it. Ignored if called on a DSQ
4683  * which doesn't exist. Can be called from any online scx_ops operations.
4684  */
4685 __bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id)
4686 {
4687 	destroy_dsq(dsq_id);
4688 }
4689 
4690 __bpf_kfunc_end_defs();
4691 
4692 static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size,
4693 			 char *fmt, unsigned long long *data, u32 data__sz)
4694 {
4695 	struct bpf_bprintf_data bprintf_data = { .get_bin_args = true };
4696 	s32 ret;
4697 
4698 	if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 ||
4699 	    (data__sz && !data)) {
4700 		scx_ops_error("invalid data=%p and data__sz=%u",
4701 			      (void *)data, data__sz);
4702 		return -EINVAL;
4703 	}
4704 
4705 	ret = copy_from_kernel_nofault(data_buf, data, data__sz);
4706 	if (ret < 0) {
4707 		scx_ops_error("failed to read data fields (%d)", ret);
4708 		return ret;
4709 	}
4710 
4711 	ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8,
4712 				  &bprintf_data);
4713 	if (ret < 0) {
4714 		scx_ops_error("format preparation failed (%d)", ret);
4715 		return ret;
4716 	}
4717 
4718 	ret = bstr_printf(line_buf, line_size, fmt,
4719 			  bprintf_data.bin_args);
4720 	bpf_bprintf_cleanup(&bprintf_data);
4721 	if (ret < 0) {
4722 		scx_ops_error("(\"%s\", %p, %u) failed to format",
4723 			      fmt, data, data__sz);
4724 		return ret;
4725 	}
4726 
4727 	return ret;
4728 }
4729 
4730 static s32 bstr_format(struct scx_bstr_buf *buf,
4731 		       char *fmt, unsigned long long *data, u32 data__sz)
4732 {
4733 	return __bstr_format(buf->data, buf->line, sizeof(buf->line),
4734 			     fmt, data, data__sz);
4735 }
4736 
4737 __bpf_kfunc_start_defs();
4738 
4739 /**
4740  * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler.
4741  * @exit_code: Exit value to pass to user space via struct scx_exit_info.
4742  * @fmt: error message format string
4743  * @data: format string parameters packaged using ___bpf_fill() macro
4744  * @data__sz: @data len, must end in '__sz' for the verifier
4745  *
4746  * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops
4747  * disabling.
4748  */
4749 __bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt,
4750 				   unsigned long long *data, u32 data__sz)
4751 {
4752 	unsigned long flags;
4753 
4754 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
4755 	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
4756 		scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s",
4757 				  scx_exit_bstr_buf.line);
4758 	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
4759 }
4760 
4761 /**
4762  * scx_bpf_error_bstr - Indicate fatal error
4763  * @fmt: error message format string
4764  * @data: format string parameters packaged using ___bpf_fill() macro
4765  * @data__sz: @data len, must end in '__sz' for the verifier
4766  *
4767  * Indicate that the BPF scheduler encountered a fatal error and initiate ops
4768  * disabling.
4769  */
4770 __bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data,
4771 				    u32 data__sz)
4772 {
4773 	unsigned long flags;
4774 
4775 	raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags);
4776 	if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0)
4777 		scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s",
4778 				  scx_exit_bstr_buf.line);
4779 	raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags);
4780 }
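
/*
 * Illustrative sketch, BPF scheduler side: schedulers usually reach
 * scx_bpf_exit_bstr() and scx_bpf_error_bstr() through convenience wrappers
 * (commonly named scx_bpf_exit() and scx_bpf_error() in the scheduler-side
 * sched_ext headers, an assumption here) which package the varargs into
 * @data/@data__sz as required above:
 *
 *	if (scx_bpf_create_dsq(MY_DSQ_ID, -1))
 *		scx_bpf_error("failed to create DSQ %d", MY_DSQ_ID);
 */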
4781 
4782 /**
4783  * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler
4784  * @fmt: format string
4785  * @data: format string parameters packaged using ___bpf_fill() macro
4786  * @data__sz: @data len, must end in '__sz' for the verifier
4787  *
4788  * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and
4789  * dump_task() to generate extra debug dump specific to the BPF scheduler.
4790  *
4791  * The extra dump may be multiple lines. A single line may be split over
4792  * multiple calls. The last line is automatically terminated.
4793  */
4794 __bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data,
4795 				   u32 data__sz)
4796 {
4797 	struct scx_dump_data *dd = &scx_dump_data;
4798 	struct scx_bstr_buf *buf = &dd->buf;
4799 	s32 ret;
4800 
4801 	if (raw_smp_processor_id() != dd->cpu) {
4802 		scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends");
4803 		return;
4804 	}
4805 
4806 	/* append the formatted string to the line buf */
4807 	ret = __bstr_format(buf->data, buf->line + dd->cursor,
4808 			    sizeof(buf->line) - dd->cursor, fmt, data, data__sz);
4809 	if (ret < 0) {
4810 		dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)",
4811 			  dd->prefix, fmt, data, data__sz, ret);
4812 		return;
4813 	}
4814 
4815 	dd->cursor += ret;
4816 	dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line));
4817 
4818 	if (!dd->cursor)
4819 		return;
4820 
4821 	/*
4822 	 * If the line buf overflowed or ends in a newline, flush it into the
4823 	 * dump. This is to allow the caller to generate a single line over
4824 	 * multiple calls. As ops_dump_flush() can also handle multiple lines in
4825 	 * the line buf, the only case which can lead to an unexpected
4826 	 * truncation is when the caller repeatedly generates newlines in the
4827 	 * middle of a line rather than at the end. Don't do that.
4828 	 */
4829 	if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n')
4830 		ops_dump_flush();
4831 }
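
/*
 * Illustrative sketch, BPF scheduler side: an ops.dump_cpu() emitting one
 * extra line per busy CPU through the scx_bpf_dump() wrapper mentioned
 * above. The BPF_STRUCT_OPS() wrapper and the exact ops.dump_cpu() argument
 * names are assumptions about the scheduler-side sources.
 *
 *	void BPF_STRUCT_OPS(example_dump_cpu, struct scx_dump_ctx *dctx,
 *			    s32 cpu, bool idle)
 *	{
 *		if (!idle)
 *			scx_bpf_dump("example: cpu=%d busy\n", cpu);
 *	}
 */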
4832 
4833 /**
4834  * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs
4835  *
4836  * All valid CPU IDs in the system are smaller than the returned value.
4837  */
4838 __bpf_kfunc u32 scx_bpf_nr_cpu_ids(void)
4839 {
4840 	return nr_cpu_ids;
4841 }
4842 
4843 /**
4844  * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask
4845  */
4846 __bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void)
4847 {
4848 	return cpu_possible_mask;
4849 }
4850 
4851 /**
4852  * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask
4853  */
4854 __bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void)
4855 {
4856 	return cpu_online_mask;
4857 }
4858 
4859 /**
4860  * scx_bpf_put_cpumask - Release a possible/online cpumask
4861  * @cpumask: cpumask to release
4862  */
4863 __bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask)
4864 {
4865 	/*
4866 	 * Empty function body because we aren't actually acquiring or releasing
4867 	 * a reference to a global cpumask, which is read-only in the caller and
4868 	 * is never released. The acquire / release semantics here are just used
4869 	 * to make the cpumask a trusted pointer in the caller.
4870 	 */
4871 }
4872 
4873 /**
4874  * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking
4875  * per-CPU cpumask.
4876  *
4877  * Returns an empty cpumask if idle tracking is not enabled, or on a UP kernel.
4878  */
4879 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void)
4880 {
4881 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
4882 		scx_ops_error("built-in idle tracking is disabled");
4883 		return cpu_none_mask;
4884 	}
4885 
4886 #ifdef CONFIG_SMP
4887 	return idle_masks.cpu;
4888 #else
4889 	return cpu_none_mask;
4890 #endif
4891 }
4892 
4893 /**
4894  * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking,
4895  * per-physical-core cpumask. Can be used to determine if an entire physical
4896  * core is free.
4897  *
4898  * Returns an empty cpumask if idle tracking is not enabled, or on a UP kernel.
4899  */
4900 __bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void)
4901 {
4902 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
4903 		scx_ops_error("built-in idle tracking is disabled");
4904 		return cpu_none_mask;
4905 	}
4906 
4907 #ifdef CONFIG_SMP
4908 	if (sched_smt_active())
4909 		return idle_masks.smt;
4910 	else
4911 		return idle_masks.cpu;
4912 #else
4913 	return cpu_none_mask;
4914 #endif
4915 }
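
/*
 * Illustrative sketch, BPF scheduler side: checking whether @prev_cpu's whole
 * physical core is idle, e.g. from ops.select_cpu() where prev_cpu is in
 * scope. bpf_cpumask_test_cpu() is the generic BPF cpumask kfunc; the
 * acquired mask must be released with scx_bpf_put_idle_cpumask().
 *
 *	const struct cpumask *smt = scx_bpf_get_idle_smtmask();
 *	bool core_idle = bpf_cpumask_test_cpu(prev_cpu, smt);
 *
 *	scx_bpf_put_idle_cpumask(smt);
 */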
4916 
4917 /**
4918  * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to
4919  * either the percpu, or SMT idle-tracking cpumask.
4920  */
4921 __bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask)
4922 {
4923 	/*
4924 	 * Empty function body because we aren't actually acquiring or releasing
4925 	 * a reference to a global idle cpumask, which is read-only in the
4926 	 * caller and is never released. The acquire / release semantics here
4927 	 * are just used to make the cpumask a trusted pointer in the caller.
4928 	 */
4929 }
4930 
4931 /**
4932  * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state
4933  * @cpu: cpu to test and clear idle for
4934  *
4935  * Returns %true if @cpu was idle and its idle state was successfully cleared.
4936  * %false otherwise.
4937  *
4938  * Unavailable if ops.update_idle() is implemented and
4939  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
4940  */
4941 __bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu)
4942 {
4943 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
4944 		scx_ops_error("built-in idle tracking is disabled");
4945 		return false;
4946 	}
4947 
4948 	if (ops_cpu_valid(cpu, NULL))
4949 		return test_and_clear_cpu_idle(cpu);
4950 	else
4951 		return false;
4952 }
4953 
4954 /**
4955  * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu
4956  * @cpus_allowed: Allowed cpumask
4957  * @flags: %SCX_PICK_IDLE_CPU_* flags
4958  *
4959  * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu
4960  * number on success. -%EBUSY if no matching cpu was found.
4961  *
4962  * Idle CPU tracking may race against CPU scheduling state transitions. For
4963  * example, this function may return -%EBUSY as CPUs are transitioning into the
4964  * idle state. If the caller then assumes that there will be dispatch events on
4965  * the CPUs as they were all busy, the scheduler may end up stalling with CPUs
4966  * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and
4967  * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch
4968  * event in the near future.
4969  *
4970  * Unavailable if ops.update_idle() is implemented and
4971  * %SCX_OPS_KEEP_BUILTIN_IDLE is not set.
4972  */
4973 __bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed,
4974 				      u64 flags)
4975 {
4976 	if (!static_branch_likely(&scx_builtin_idle_enabled)) {
4977 		scx_ops_error("built-in idle tracking is disabled");
4978 		return -EBUSY;
4979 	}
4980 
4981 	return scx_pick_idle_cpu(cpus_allowed, flags);
4982 }
4983 
4984 /**
4985  * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU
4986  * @cpus_allowed: Allowed cpumask
4987  * @flags: %SCX_PICK_IDLE_CPU_* flags
4988  *
4989  * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any
4990  * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu
4991  * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is
4992  * empty.
4993  *
4994  * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not
4995  * set, this function can't tell which CPUs are idle and will always pick any
4996  * CPU.
4997  */
4998 __bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed,
4999 				     u64 flags)
5000 {
5001 	s32 cpu;
5002 
5003 	if (static_branch_likely(&scx_builtin_idle_enabled)) {
5004 		cpu = scx_pick_idle_cpu(cpus_allowed, flags);
5005 		if (cpu >= 0)
5006 			return cpu;
5007 	}
5008 
5009 	cpu = cpumask_any_distribute(cpus_allowed);
5010 	if (cpu < nr_cpu_ids)
5011 		return cpu;
5012 	else
5013 		return -EBUSY;
5014 }
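
/*
 * Illustrative sketch, BPF scheduler side: the fallback pattern described in
 * the scx_bpf_pick_idle_cpu() comment above, kicking a CPU so that at least
 * one dispatch event is guaranteed even when the idle pick raced and failed.
 * Passing p->cpus_ptr as the allowed mask is an assumption about how the
 * scheduler tracks affinity.
 *
 *	static void kick_one_cpu_for(struct task_struct *p)
 *	{
 *		s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
 *
 *		if (cpu < 0)
 *			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 *		if (cpu >= 0)
 *			scx_bpf_kick_cpu(cpu, 0);
 *	}
 */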
5015 
5016 /**
5017  * scx_bpf_task_running - Is task currently running?
5018  * @p: task of interest
5019  */
5020 __bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p)
5021 {
5022 	return task_rq(p)->curr == p;
5023 }
5024 
5025 /**
5026  * scx_bpf_task_cpu - CPU a task is currently associated with
5027  * @p: task of interest
5028  */
5029 __bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p)
5030 {
5031 	return task_cpu(p);
5032 }
5033 
5034 __bpf_kfunc_end_defs();
5035 
5036 BTF_KFUNCS_START(scx_kfunc_ids_any)
5037 BTF_ID_FLAGS(func, scx_bpf_kick_cpu)
5038 BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued)
5039 BTF_ID_FLAGS(func, scx_bpf_destroy_dsq)
5040 BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS)
5041 BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS)
5042 BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS)
5043 BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids)
5044 BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE)
5045 BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE)
5046 BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE)
5047 BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE)
5048 BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE)
5049 BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE)
5050 BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle)
5051 BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU)
5052 BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU)
5053 BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU)
5054 BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU)
5055 BTF_KFUNCS_END(scx_kfunc_ids_any)
5056 
5057 static const struct btf_kfunc_id_set scx_kfunc_set_any = {
5058 	.owner			= THIS_MODULE,
5059 	.set			= &scx_kfunc_ids_any,
5060 };
5061 
5062 static int __init scx_init(void)
5063 {
5064 	int ret;
5065 
5066 	/*
5067 	 * kfunc registration can't be done from init_sched_ext_class() as
5068 	 * register_btf_kfunc_id_set() needs most of the system to be up.
5069 	 *
5070 	 * Some kfuncs are context-sensitive and can only be called from
5071 	 * specific SCX ops. They are grouped into BTF sets accordingly.
5072 	 * Unfortunately, BPF currently doesn't have a way of enforcing such
5073 	 * restrictions. Eventually, the verifier should be able to enforce
5074 	 * them. For now, register them the same and make each kfunc explicitly
5075 	 * check using scx_kf_allowed().
5076 	 */
5077 	if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
5078 					     &scx_kfunc_set_sleepable)) ||
5079 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
5080 					     &scx_kfunc_set_select_cpu)) ||
5081 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
5082 					     &scx_kfunc_set_enqueue_dispatch)) ||
5083 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
5084 					     &scx_kfunc_set_dispatch)) ||
5085 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
5086 					     &scx_kfunc_set_any)) ||
5087 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
5088 					     &scx_kfunc_set_any)) ||
5089 	    (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL,
5090 					     &scx_kfunc_set_any))) {
5091 		pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret);
5092 		return ret;
5093 	}
5094 
5095 	ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops);
5096 	if (ret) {
5097 		pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret);
5098 		return ret;
5099 	}
5100 
5101 	scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj);
5102 	if (!scx_kset) {
5103 		pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n");
5104 		return -ENOMEM;
5105 	}
5106 
5107 	ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group);
5108 	if (ret < 0) {
5109 		pr_err("sched_ext: Failed to add global attributes\n");
5110 		return ret;
5111 	}
5112 
5113 	return 0;
5114 }
5115 __initcall(scx_init);
5116